Objective: 1) use avatar to create a duplicated data set and compare the performance of original and avatar dataset in a cox model. 2) Evalaute the variability for a given avatar with bootstraping 3) Evaluate variabiltiy between different Avatar using different seed 4) Evaluate effect of data augmentation (X4) 5) Evaluate Survtvae in comaprison to Avatar 6) Evaluate Survctgan in comaprison to Avatar

In this anlaysis, the covariates selected after bootstraping bootstepAIC each synhtetic dataset are used for the analyses of inter dataset variability

The difference with v3 is that here with fit the multivariate model for the evaluation of knn=x instead of fitting only for haplotype and maping on knn value : that means that we have ot run separately for the different knn value the script with hteir own mulitvariate models

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5      ✔ rsample      1.2.0 
## ✔ dials        1.2.1      ✔ tune         1.2.0 
## ✔ infer        1.0.6      ✔ workflows    1.1.4 
## ✔ modeldata    1.3.0      ✔ workflowsets 1.0.1 
## ✔ parsnip      1.2.0      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
library(FNN)
library(survival)
library(survminer)
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## 
## The following object is masked from 'package:survival':
## 
##     myeloma
library(corrplot)
## corrplot 0.92 loaded
library(ggcorrplot)
library(DataExplorer)
library(patchwork)
library(tableone)
library(boot)
## 
## Attaching package: 'boot'
## 
## The following object is masked from 'package:survival':
## 
##     aml
library(bootStepAIC)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:patchwork':
## 
##     area
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(conflicted)
conflict_prefer("select", "dplyr")
## [conflicted] Will prefer dplyr::select over any other package.
conflict_prefer("filter", "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package.

Load the data

library(readr)
original <- read_delim("td_dirc_perte_greffon.txt", 
    delim = "\t", escape_double = FALSE, 
    trim_ws = TRUE) %>% 
  mutate(rejet_aigu = as.factor(rejet_aigu)
         # age_r =  scale(age_r),
         # age_d =  scale(age_d),
         # TIF =  scale(TIF),
         ) %>% 
  mutate_if(is.character, factor) %>% 
  select(-id) %>% 
  select(haplotype:delai_event)
## Rows: 253 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): id, haplotype, cyp3A5D, sexe_r, sexe_d, CYP3A4_1B, MDR1_C1236T, MDR...
## dbl (7): age_r, age_d, rejet_aigu, TIF, event, delai_event, pente_creat
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(original)
##  haplotype   cyp3A5D       age_r       sexe_r      age_d       sexe_d 
##  autre: 97   Es : 42   Min.   :19.00   F: 97   Min.   :12.00   F: 79  
##  het  :123   NEs:211   1st Qu.:44.00   M:156   1st Qu.:25.00   M:174  
##  hom  : 33             Median :55.00           Median :40.00          
##                        Mean   :53.84           Mean   :38.49          
##                        3rd Qu.:64.00           3rd Qu.:49.00          
##                        Max.   :78.00           Max.   :73.00          
##  rejet_aigu      TIF           event          delai_event    
##  0:172      Min.   : 303   Min.   :0.00000   Min.   : 0.680  
##  1: 81      1st Qu.: 975   1st Qu.:0.00000   1st Qu.: 2.920  
##             Median :1153   Median :0.00000   Median : 5.340  
##             Mean   :1199   Mean   :0.08696   Mean   : 6.044  
##             3rd Qu.:1368   3rd Qu.:0.00000   3rd Qu.: 8.700  
##             Max.   :2580   Max.   :1.00000   Max.   :15.830
str(original)
## tibble [253 × 10] (S3: tbl_df/tbl/data.frame)
##  $ haplotype  : Factor w/ 3 levels "autre","het",..: 2 2 3 2 2 2 3 3 1 2 ...
##  $ cyp3A5D    : Factor w/ 2 levels "Es","NEs": 2 2 1 1 2 1 1 2 2 2 ...
##  $ age_r      : num [1:253] 74 64 47 42 53 62 68 77 52 58 ...
##  $ sexe_r     : Factor w/ 2 levels "F","M": 1 2 2 2 2 1 1 2 2 1 ...
##  $ age_d      : num [1:253] 52 41 23 44 30 16 46 58 26 40 ...
##  $ sexe_d     : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 1 1 2 2 ...
##  $ rejet_aigu : Factor w/ 2 levels "0","1": 2 2 1 2 2 2 1 2 1 1 ...
##  $ TIF        : num [1:253] 1020 825 1020 827 1245 ...
##  $ event      : num [1:253] 1 1 1 1 1 1 1 1 1 1 ...
##  $ delai_event: num [1:253] 10.21 5.78 3.91 3.27 6.79 ...
original1 <- original %>% mutate_if(is.factor, as.numeric)# converti en factor par ordre alhpahbetique+++
write.csv(original1, file = "original1.csv")

Avatar

In this code we will vary the seed for avatar with a fix number of knn (here=5) and bootsptrap the Cox model The goal is to extract the variability of HR for a given dataset and between different seed to obtain the overall uncertainty

run a single Avatar knn=5

We launch a single Avatar with knn=5 and a given seed and we compare the results to the original data

data_normalized <- scale(original1)
pca <- prcomp(data_normalized, scale. = FALSE)# pour selecitonner le nombre de cp rank. = 3
# Number of neighbors
k <- 5  # Adjust this based on your requirement

algorithm

pca_transformed_data <- pca$x
knn_result <- get.knn(pca_transformed_data, k)

generate_avatar_weights <- function(knn_result, pca_transformed_data, k) {
  n <- nrow(pca_transformed_data)
  avatar_weights <- matrix(nrow = n, ncol = k)
  
  for (i in 1:n) {
    # Step 1: Inverse of Distances
    distances <- sqrt(rowSums((pca_transformed_data[knn_result$nn.index[i, ], ] - pca_transformed_data[i, ])^2))
    inverse_distances <- 1 / distances
    
    # Step 2: Random Weights
   
    random_weights <- rexp(k, rate = 1)
    
    # Step 3: Contribution Factors
   
    shuffled_indices <- sample(k)
    contribution_factors <- 1 / (2^shuffled_indices)
    
    # Step 4: Calculate Weights
    weights <- inverse_distances * random_weights * contribution_factors
    
    # Step 5: Normalize Weights
    normalized_weights <- weights / sum(weights)
    
    avatar_weights[i, ] <- normalized_weights
  }
  
  return(avatar_weights)
}



# Generate avatar weights
 set.seed(12)
avatar_weights <- generate_avatar_weights(knn_result, pca_transformed_data, k)

Generation of avatar in the latent space

# Assuming pca_result, avatar_weights, knn_result$nn.index, and pca_transformed_data are already defined

# Function to generate avatars in PCA space based on weights
generate_avatars_pca_space <- function(pca_transformed_data, knn_indices, weights) {
  n <- nrow(pca_transformed_data)
  avatars_pca <- matrix(nrow = n, ncol = ncol(pca_transformed_data))
  
  for (i in 1:n) {
    weighted_avatars <- pca_transformed_data[knn_indices[i, ], ] * weights[i, ]
    avatars_pca[i, ] <- colSums(weighted_avatars)
  }
  
  return(avatars_pca)
}
# Generate avatars in PCA space
avatars_pca_space <- generate_avatars_pca_space(pca_transformed_data, knn_result$nn.index, avatar_weights)

Return to the initial scale

# Assuming 'aids_pca' is the PCA object and 'avatars_pca_space' contains the avatars in PCA space
# Inverse PCA transformation
inverse_pca <- function(pca_object, pca_data) {
  return(pca_data %*% t(pca_object$rotation) + matrix(pca_object$center, nrow = nrow(pca_data), ncol = ncol(pca_object$rotation), byrow = TRUE))
}
avatars_original_scale <- inverse_pca(pca, avatars_pca_space)

# Assuming 'aids_data_normalized' contains the scaling attributes of the original data
# Inverse normalization (if the original data was normalized)
avatars_rescaled <- scale(avatars_original_scale, center = FALSE, scale = 1/attr(data_normalized, "scaled:scale"))
avatars_rescaled <- sweep(avatars_rescaled, 2, attr(data_normalized, "scaled:center"), "+")

Transform into tibble

avatars_tibble_knn5 <- as_tibble(avatars_rescaled) %>% 
  mutate(haplotype = round(haplotype, digits=0),
         cyp3A5D = round(cyp3A5D, digits=0),
         sexe_r  = round(sexe_r , digits=0),
         sexe_d  = round(sexe_d , digits=0),
         rejet_aigu  = round(rejet_aigu , digits=0),
         event = round(event, digits=0)
         # CYP3A4_1B = round(CYP3A4_1B, digits=0),
         # MDR1_C1236T = round(MDR1_C1236T, digits=0),
         # MDR1_G2677T = round(MDR1_G2677T, digits=0),
         # MDR1_C3435T = round(MDR1_C3435T, digits=0)
  ) 

avatars_tibble_factor_knn5 <- as_tibble(avatars_rescaled) %>% 
  mutate(haplotype = round(haplotype, digits=0),
         cyp3A5D = round(cyp3A5D, digits=0),
         sexe_r  = round(sexe_r , digits=0),
         sexe_d  = round(sexe_d , digits=0),
         rejet_aigu  = round(rejet_aigu , digits=0),
         event = round(event, digits=0)
         # CYP3A4_1B = round(CYP3A4_1B, digits=0),
         # MDR1_C1236T = round(MDR1_C1236T, digits=0),
         # MDR1_G2677T = round(MDR1_G2677T, digits=0),
         # MDR1_C3435T = round(MDR1_C3435T, digits=0)
  ) %>% 
  mutate(haplotype = as.factor(haplotype),
         cyp3A5D = as.factor(cyp3A5D),
         sexe_r = as.factor(sexe_r),
         sexe_d = as.factor(sexe_d),
         # CYP3A4_1B = as.factor(CYP3A4_1B),
         # MDR1_C1236T = as.factor(MDR1_C1236T),
         # MDR1_G2677T = as.factor(MDR1_G2677T),
         # MDR1_C3435T = as.factor(MDR1_C3435T),
         rejet_aigu = as.factor(rejet_aigu))

Plot of the synthetic and original in the latent space

# Combine original and synthetic data for visualization
combined_data <- rbind(
  original1 %>% mutate(DataType = 'Original'),
  avatars_tibble_knn5 %>% mutate(DataType = 'Synthetic')
)

# Perform PCA on combined data
combined_data_normalized <- scale(combined_data[, -which(names(combined_data) %in% c("DataType", "id"))])
combined_pca <- prcomp(combined_data_normalized, scale. = FALSE)

# Extract the first two principal components
combined_pca_data <- data.frame(combined_pca$x[, 1:2])
combined_pca_data$DataType <- combined_data$DataType

# Plot PCA with color differentiation
ggplot(combined_pca_data, aes(x = PC1, y = PC2, color = DataType)) +
  geom_point(alpha = 0.8) +
  theme_minimal() +
  labs(title = "PCA Plot", x = "Principal Component 1", y = "Principal Component 2", color = "Data Type")

Comparison of the datasets

Summary of the 2 datasets

## Vector of categorical variables that need transformation
catVars <- c("haplotype", "cyp3A5D",  "sexe_r",  "sexe_d", 
"rejet_aigu", "event")
## Create a variable list.
vars <- c( "haplotype", "cyp3A5D", "age_r", "sexe_r", "age_d", "sexe_d", 
"rejet_aigu", "TIF", "event", "delai_event", "DataType")
tableOne <- CreateTableOne(vars = vars, strata = "DataType",factorVars = catVars, data = combined_data)
tableOne2<-print(tableOne, nonnormal = c( "age_r", "age_d", "TIF", "delai_event"), printToggle=F, minMax=T)
Original Synthetic p test
n 253 253
haplotype (%) 0.022
1 97 (38.3) 93 ( 36.8)
2 123 (48.6) 144 ( 56.9)
3 33 (13.0) 16 ( 6.3)
cyp3A5D = 2 (%) 211 (83.4) 217 ( 85.8) 0.538
age_r (median [range]) 55.00 [19.00, 78.00] 55.09 [24.19, 73.74] 0.487 nonnorm
sexe_r = 2 (%) 156 (61.7) 163 ( 64.4) 0.581
age_d (median [range]) 40.00 [12.00, 73.00] 39.82 [19.40, 68.49] 0.642 nonnorm
sexe_d = 2 (%) 174 (68.8) 185 ( 73.1) 0.327
rejet_aigu = 2 (%) 81 (32.0) 73 ( 28.9) 0.499
TIF (median [range]) 1153.00 [303.00, 2580.00] 1174.55 [456.35, 2362.15] 0.560 nonnorm
event = 1 (%) 22 ( 8.7) 21 ( 8.3) 1.000
delai_event (median [range]) 5.34 [0.68, 15.83] 5.36 [0.97, 14.94] 0.866 nonnorm
DataType = Synthetic (%) 0 ( 0.0) 253 (100.0) <0.001
summary(original)
##  haplotype   cyp3A5D       age_r       sexe_r      age_d       sexe_d 
##  autre: 97   Es : 42   Min.   :19.00   F: 97   Min.   :12.00   F: 79  
##  het  :123   NEs:211   1st Qu.:44.00   M:156   1st Qu.:25.00   M:174  
##  hom  : 33             Median :55.00           Median :40.00          
##                        Mean   :53.84           Mean   :38.49          
##                        3rd Qu.:64.00           3rd Qu.:49.00          
##                        Max.   :78.00           Max.   :73.00          
##  rejet_aigu      TIF           event          delai_event    
##  0:172      Min.   : 303   Min.   :0.00000   Min.   : 0.680  
##  1: 81      1st Qu.: 975   1st Qu.:0.00000   1st Qu.: 2.920  
##             Median :1153   Median :0.00000   Median : 5.340  
##             Mean   :1199   Mean   :0.08696   Mean   : 6.044  
##             3rd Qu.:1368   3rd Qu.:0.00000   3rd Qu.: 8.700  
##             Max.   :2580   Max.   :1.00000   Max.   :15.830
summary(avatars_tibble_knn5)
##    haplotype        cyp3A5D          age_r           sexe_r     
##  Min.   :1.000   Min.   :1.000   Min.   :24.19   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:45.93   1st Qu.:1.000  
##  Median :2.000   Median :2.000   Median :55.09   Median :2.000  
##  Mean   :1.696   Mean   :1.858   Mean   :53.06   Mean   :1.644  
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:61.18   3rd Qu.:2.000  
##  Max.   :3.000   Max.   :2.000   Max.   :73.74   Max.   :2.000  
##      age_d           sexe_d        rejet_aigu         TIF        
##  Min.   :19.40   Min.   :1.000   Min.   :1.000   Min.   : 456.3  
##  1st Qu.:30.60   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1053.0  
##  Median :39.82   Median :2.000   Median :1.000   Median :1174.5  
##  Mean   :39.04   Mean   :1.731   Mean   :1.289   Mean   :1192.8  
##  3rd Qu.:47.46   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:1304.9  
##  Max.   :68.49   Max.   :2.000   Max.   :2.000   Max.   :2362.1  
##      event        delai_event     
##  Min.   :0.000   Min.   : 0.9749  
##  1st Qu.:0.000   1st Qu.: 3.6906  
##  Median :0.000   Median : 5.3569  
##  Mean   :0.083   Mean   : 5.7325  
##  3rd Qu.:0.000   3rd Qu.: 7.0216  
##  Max.   :1.000   Max.   :14.9361

individual data explorer

# boxplots
plot_boxplot(combined_data , by ="DataType") 

# histograms

# Function to create histogram for each continuous variable
plot_histograms <- function(data, var_name, group_var) {
  ggplot(data, aes(x = !!sym(var_name), fill = !!sym(group_var))) +
    geom_histogram(alpha = 0.5,show.legend = FALSE) +
    labs(x = var_name, y = "Count") +
    theme_minimal() +
    ggtitle(paste(var_name))
}

# Using select_if to identify continuous variables and map to apply the function
plots <- combined_data %>%
  select( -sexe_r,-sexe_d) %>% 
  select_if(is.numeric) %>%
  names() %>%
  map(~plot_histograms(combined_data, ., "DataType"))

# Optionally, print or arrange plots (e.g., using gridExtra or patchwork packages)

wrap_plots(plots)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

plot correlation

##Correlation Analysis
  cor_real <- cor(original1, use = "complete.obs")
  cor_synthetic <- cor(avatars_tibble_knn5, use = "complete.obs")
  
# plots
ggcorrplot(cor_real, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

# plots
ggcorrplot(cor_synthetic, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

Modele de Cox

# original
fit_original <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = original1)
## 
##   n= 253, number of events= 22 
## 
##                  coef  exp(coef)   se(coef)      z Pr(>|z|)    
## haplotype   1.2112071  3.3575352  0.3463273  3.497  0.00047 ***
## cyp3A5D    -1.2323909  0.2915946  0.5567303 -2.214  0.02685 *  
## age_r      -0.0039521  0.9960557  0.0187880 -0.210  0.83339    
## sexe_r     -0.0438849  0.9570641  0.4606422 -0.095  0.92410    
## age_d       0.0360206  1.0366772  0.0203668  1.769  0.07696 .  
## sexe_d      0.2786636  1.3213627  0.5181402  0.538  0.59070    
## rejet_aigu  1.0124644  2.7523756  0.4804379  2.107  0.03508 *  
## TIF        -0.0002268  0.9997732  0.0005753 -0.394  0.69345    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     3.3575     0.2978   1.70305    6.6193
## cyp3A5D       0.2916     3.4294   0.09792    0.8683
## age_r         0.9961     1.0040   0.96004    1.0334
## sexe_r        0.9571     1.0449   0.38801    2.3607
## age_d         1.0367     0.9646   0.99611    1.0789
## sexe_d        1.3214     0.7568   0.47861    3.6481
## rejet_aigu    2.7524     0.3633   1.07339    7.0576
## TIF           0.9998     1.0002   0.99865    1.0009
## 
## Concordance= 0.758  (se = 0.054 )
## Likelihood ratio test= 24.6  on 8 df,   p=0.002
## Wald test            = 21.09  on 8 df,   p=0.007
## Score (logrank) test = 25.84  on 8 df,   p=0.001
ggforest(fit_original)

# synthetique
fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF, data = avatars_tibble_knn5)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = avatars_tibble_knn5)
## 
##   n= 253, number of events= 21 
## 
##                  coef  exp(coef)   se(coef)      z Pr(>|z|)    
## haplotype   2.2993957  9.9681574  0.4532528  5.073 3.91e-07 ***
## cyp3A5D    -0.4985711  0.6073979  0.7870725 -0.633   0.5264    
## age_r       0.0622015  1.0641767  0.0333837  1.863   0.0624 .  
## sexe_r      0.5921961  1.8079544  0.5394767  1.098   0.2723    
## age_d       0.0713866  1.0739964  0.0366682  1.947   0.0516 .  
## sexe_d      1.9252423  6.8568098  0.7852001  2.452   0.0142 *  
## rejet_aigu  0.9571739  2.6043260  0.5889792  1.625   0.1041    
## TIF        -0.0003885  0.9996116  0.0010378 -0.374   0.7081    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     9.9682     0.1003    4.1002    24.234
## cyp3A5D       0.6074     1.6464    0.1299     2.841
## age_r         1.0642     0.9397    0.9968     1.136
## sexe_r        1.8080     0.5531    0.6280     5.205
## age_d         1.0740     0.9311    0.9995     1.154
## sexe_d        6.8568     0.1458    1.4715    31.951
## rejet_aigu    2.6043     0.3840    0.8210     8.261
## TIF           0.9996     1.0004    0.9976     1.002
## 
## Concordance= 0.874  (se = 0.028 )
## Likelihood ratio test= 43.67  on 8 df,   p=7e-07
## Wald test            = 36.17  on 8 df,   p=2e-05
## Score (logrank) test = 44.51  on 8 df,   p=5e-07
ggforest(fit_synthetique)

BootstepAIC based on BIC

Allow to see which vairable would have been selected

Original

boot.stepAIC(fit_original, original1, B = 100, k=log(nrow(original1)))
## 
## Summary of Bootstrapping the 'stepAIC()' procedure for
## 
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = original1)
## 
## Bootstrap samples: 100 
## Direction: backward 
## Penalty: 5.53 * df
## 
## Covariates selected
##            (%)
## haplotype   95
## rejet_aigu  47
## cyp3A5D     29
## age_d       21
## sexe_d       3
## Null         2
## sexe_r       2
## TIF          2
## 
## Coefficients Sign
##            + (%) - (%)
## age_d        100     0
## haplotype    100     0
## rejet_aigu   100     0
## sexe_d       100     0
## sexe_r       100     0
## cyp3A5D        0   100
## TIF            0   100
## 
## Stat Significance
##            (%)
## age_d      100
## cyp3A5D    100
## haplotype  100
## rejet_aigu 100
## sexe_d     100
## sexe_r     100
## TIF        100
## 
## 
## The stepAIC() for the original data-set gave
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype, data = original1)
## 
##             coef exp(coef) se(coef)     z        p
## haplotype 1.2035    3.3319   0.3276 3.674 0.000239
## 
## Likelihood ratio test=14.01  on 1 df, p=0.0001822
## n= 253, number of events= 22 
## 
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Surv(delai_event, event) ~ haplotype + cyp3A5D + age_r + sexe_r + 
##     age_d + sexe_d + rejet_aigu + TIF
## 
## Final Model:
## Surv(delai_event, event) ~ haplotype
## 
## 
##           Step Df    Deviance Resid. Df Resid. Dev      AIC
## 1                                    14  -24.59675 212.4446
## 2     - sexe_r  1 0.009067423        15  -24.58769 206.9203
## 3      - age_r  1 0.039034149        16  -24.54865 201.4259
## 4        - TIF  1 0.251427799        17  -24.29723 196.1439
## 5     - sexe_d  1 0.442126797        18  -23.85510 191.0527
## 6      - age_d  1 2.811491990        19  -21.04361 188.3308
## 7    - cyp3A5D  1 2.805921958        20  -18.23768 185.6033
## 8 - rejet_aigu  1 4.230950507        21  -14.00673 184.3009

synhtetic knn5

boot.stepAIC(fit_synthetique, avatars_tibble_knn5, B = 100, k=log(nrow(avatars_tibble_knn5)))
## 
## Summary of Bootstrapping the 'stepAIC()' procedure for
## 
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = avatars_tibble_knn5)
## 
## Bootstrap samples: 100 
## Direction: backward 
## Penalty: 5.53 * df
## 
## Covariates selected
##            (%)
## haplotype  100
## sexe_d      44
## age_d       33
## rejet_aigu  26
## age_r       13
## sexe_r      10
## cyp3A5D      3
## TIF          2
## 
## Coefficients Sign
##             + (%)  - (%)
## age_d      100.00   0.00
## age_r      100.00   0.00
## haplotype  100.00   0.00
## rejet_aigu 100.00   0.00
## sexe_d     100.00   0.00
## sexe_r     100.00   0.00
## cyp3A5D     33.33  66.67
## TIF          0.00 100.00
## 
## Stat Significance
##               (%)
## age_d      100.00
## age_r      100.00
## haplotype  100.00
## rejet_aigu 100.00
## sexe_r     100.00
## TIF        100.00
## sexe_d      84.09
## cyp3A5D     66.67
## 
## 
## The stepAIC() for the original data-set gave
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype, data = avatars_tibble_knn5)
## 
##              coef exp(coef) se(coef)     z        p
## haplotype  2.3545   10.5330   0.4374 5.383 7.32e-08
## 
## Likelihood ratio test=30.11  on 1 df, p=4.087e-08
## n= 253, number of events= 21 
## 
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Surv(delai_event, event) ~ haplotype + cyp3A5D + age_r + sexe_r + 
##     age_d + sexe_d + rejet_aigu + TIF
## 
## Final Model:
## Surv(delai_event, event) ~ haplotype
## 
## 
##           Step Df  Deviance Resid. Df Resid. Dev      AIC
## 1                                  13  -43.66527 188.3444
## 2        - TIF  1 0.1453500        14  -43.51992 182.9564
## 3    - cyp3A5D  1 0.3324782        15  -43.18744 177.7554
## 4     - sexe_r  1 1.0935263        16  -42.09391 173.3156
## 5      - age_r  1 2.6161991        17  -39.47771 170.3984
## 6 - rejet_aigu  1 1.4420932        18  -38.03562 166.3071
## 7     - sexe_d  1 5.2900187        19  -32.74560 166.0637
## 8      - age_d  1 2.6377284        20  -30.10787 163.1681
Final model original
fit_original <- coxph(Surv(delai_event, event) ~ haplotype , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype, data = original1)
## 
##   n= 253, number of events= 22 
## 
##             coef exp(coef) se(coef)     z Pr(>|z|)    
## haplotype 1.2035    3.3319   0.3276 3.674 0.000239 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## haplotype     3.332     0.3001     1.753     6.332
## 
## Concordance= 0.682  (se = 0.044 )
## Likelihood ratio test= 14.01  on 1 df,   p=2e-04
## Wald test            = 13.5  on 1 df,   p=2e-04
## Score (logrank) test = 14.91  on 1 df,   p=1e-04
Final model synthetic
fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype  , data = avatars_tibble_knn5)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype, data = avatars_tibble_knn5)
## 
##   n= 253, number of events= 21 
## 
##              coef exp(coef) se(coef)     z Pr(>|z|)    
## haplotype  2.3545   10.5330   0.4374 5.383 7.32e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## haplotype     10.53    0.09494     4.469     24.82
## 
## Concordance= 0.748  (se = 0.033 )
## Likelihood ratio test= 30.11  on 1 df,   p=4e-08
## Wald test            = 28.98  on 1 df,   p=7e-08
## Score (logrank) test = 28.45  on 1 df,   p=1e-07

Bootstrap of the coefficient for haplotype

Allow to define the variability range of HR for a given dataset (intra dataset variability)

# Define the Cox model
cox_model <- function(data, indices) {
  d <- data[indices,] # allows bootstrapping to sample the data
  fit <- coxph(Surv(delai_event, event) ~ haplotype , data=d)
  return(fit$coefficients)
}

# Set the seed for reproducibility
set.seed(12)

# Bootstrap the Cox model
boot_results <- boot(data=avatars_tibble_knn5, statistic=cox_model, R=100)

# Convert bootstrap results to a data frame for ggplot2
boot_hrs <- exp(boot_results$t) # Convert log(HR) to HR
hr_data <- data.frame(HR=boot_hrs[,1])

# Calculate summary statistics
summary_stats <- quantile(hr_data$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1))
names(summary_stats) <- c("Min","2.5th", "5th", "25th", "Median", "75th", "95th","97.5th","Max")

# Create the histogram
ggplot(hr_data, aes(x=HR)) +
  geom_histogram(bins=30, fill="#007a86", color="black") +
  # geom_vline(aes(xintercept=summary_stats["Min"]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=summary_stats["25th"]), color="gray", linetype="dashed", linewidth=2) +
  geom_vline(aes(xintercept=summary_stats["Median"]), color="blue", linetype="dashed", linewidth=2) +
  geom_vline(aes(xintercept=summary_stats["75th"]), color="gray", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["Max"]), color="purple", linetype="dashed") +
  labs(title="Bootstrap Distribution of Hazard Ratios", x="Hazard Ratio (HR)", y="Frequency") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

# Print summary statistics
print(summary_stats)
##       Min     2.5th       5th      25th    Median      75th      95th    97.5th 
##  6.517357  6.905228  7.364727  8.959359 11.096550 13.685278 20.473030 22.418775 
##       Max 
## 44.264827

bootstraping of the original data for comparison

# Set the seed for reproducibility
set.seed(12)

# Bootstrap the Cox model
boot_results <- boot(data=original1, statistic=cox_model, R=100)

# Convert bootstrap results to a data frame for ggplot2
boot_hrs <- exp(boot_results$t) # Convert log(HR) to HR
hr_data <- data.frame(HR=boot_hrs[,1])

# Calculate summary statistics
summary_stats <- quantile(hr_data$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1))
names(summary_stats) <- c("Min","2.5th", "5th", "25th", "Median", "75th", "95th","97.5th","Max")

# Create the histogram
ggplot(hr_data, aes(x=HR)) +
  geom_histogram(bins=30, fill="#007a86", color="black") +
  # geom_vline(aes(xintercept=summary_stats["Min"]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=summary_stats["25th"]), color="gray", linetype="dashed", linewidth=2) +
  geom_vline(aes(xintercept=summary_stats["Median"]), color="blue", linetype="dashed", linewidth=2) +
  geom_vline(aes(xintercept=summary_stats["75th"]), color="gray", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["Max"]), color="purple", linetype="dashed") +
  labs(title="Bootstrap Distribution of Hazard Ratios", x="Hazard Ratio (HR)", y="Frequency") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

# Print summary statistics
print(summary_stats)
##       Min     2.5th       5th      25th    Median      75th      95th    97.5th 
##  1.761377  1.935648  2.149621  2.864674  3.404484  4.246031  7.064679  7.651689 
##       Max 
## 12.060006
Modele final & KM
km_original <- survfit(Surv(delai_event, event) ~ haplotype, data = original)
km_original_plot <- ggsurvplot(
  km_original,
  data = original,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)
km_original_plot

km_synthetique <- survfit(Surv(delai_event, event) ~ haplotype, data = avatars_tibble_factor_knn5)
km_synthetique_avatar_5 <- ggsurvplot(
  km_synthetique,
  data = avatars_tibble_factor_knn5,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)
km_synthetique_avatar_5

Plots original & synthetic combined

## combine data
combined_df <- rbind(original %>% mutate(group = "original"), avatars_tibble_factor_knn5 %>% mutate(group = "synthetic")) %>% mutate(combined_haplotype = str_c(haplotype,"_", group ))

## fit the model
km_combined <- survfit(Surv(delai_event, event) ~ combined_haplotype, data = combined_df)

# plot
ggsurvplot(fit = km_combined, 
           data = combined_df,
           
  size = 1,                 # change line size
  conf.int = FALSE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.35, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)

Graphical exploraiotn of distribution

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
pm_knn5 <- combined_df %>% select(haplotype:delai_event, group) %>% 
  rename(acute_rejection = rejet_aigu, sex_r = sexe_r, sex_d = sexe_d) %>% 
  ggpairs(
  ggplot2::aes(colour = group,alpha = 0.5),
  upper = list(continuous = wrap("cor", size = 1.5)),
  lower=list(combo=wrap("facethist", binwidth=0.5))) + 
  theme(strip.text.x = element_text(size = 5),
           strip.text.y = element_text(size = 5),axis.text = element_text(size = 5))
pm_knn5

ggsave("Figure1_graft_loss.pdf")
## Saving 7 x 5 in image
# ggsave("comparaison_distribution_knn5.pdf")

function to aggregating the HR and CI95 results of Cox models after changing 100 times the seed with knn = 5

Allow to define the variability range of HR for different Avatar generated with different seed but the same knn (inter dataset variability)

# Assuming all your existing functions and necessary libraries are loaded

run_model_with_seed <- function(seed_value) {
 
  
  # Number of neighbors
  k <- 5  # Adjust this based on your requirement
  
  pca_transformed_data <- pca$x
  knn_result <- get.knn(pca_transformed_data, k)
  
  generate_avatar_weights <- function(knn_result, pca_transformed_data, k) {
    n <- nrow(pca_transformed_data)
    avatar_weights <- matrix(nrow = n, ncol = k)
    
    for (i in 1:n) {
      # Step 1: Inverse of Distances
      distances <- sqrt(rowSums((pca_transformed_data[knn_result$nn.index[i, ], ] - pca_transformed_data[i, ])^2))
      inverse_distances <- 1 / distances
      
      # Step 2: Random Weights

      random_weights <- rexp(k, rate = 1)
      
      # Step 3: Contribution Factors
  
      shuffled_indices <- sample(k)
      contribution_factors <- 1 / (2^shuffled_indices)
      
      # Step 4: Calculate Weights
      weights <- inverse_distances * random_weights * contribution_factors
      
      # Step 5: Normalize Weights
      normalized_weights <- weights / sum(weights)
      
      avatar_weights[i, ] <- normalized_weights
    }
    
    return(avatar_weights)
  }
  
  
  
  # Generate avatar weights
   set.seed(seed_value)
  avatar_weights <- generate_avatar_weights(knn_result, pca_transformed_data, k)
  
  
  # Function to generate avatars in PCA space based on weights
  generate_avatars_pca_space <- function(pca_transformed_data, knn_indices, weights) {
    n <- nrow(pca_transformed_data)
    avatars_pca <- matrix(nrow = n, ncol = ncol(pca_transformed_data))
    
    for (i in 1:n) {
      weighted_avatars <- pca_transformed_data[knn_indices[i, ], ] * weights[i, ]
      avatars_pca[i, ] <- colSums(weighted_avatars)
    }
    
    return(avatars_pca)
  }
  # Generate avatars in PCA space
  avatars_pca_space <- generate_avatars_pca_space(pca_transformed_data, knn_result$nn.index, avatar_weights)
  

  # Inverse PCA transformation
  inverse_pca <- function(pca_object, pca_data) {
    return(pca_data %*% t(pca_object$rotation) + matrix(pca_object$center, nrow = nrow(pca_data), ncol = ncol(pca_object$rotation), byrow = TRUE))
  }
  avatars_original_scale <- inverse_pca(pca, avatars_pca_space)
  
 
  # Inverse normalization (if the original data was normalized)
  avatars_rescaled <- scale(avatars_original_scale, center = FALSE, scale = 1/attr(data_normalized, "scaled:scale"))
  avatars_rescaled <- sweep(avatars_rescaled, 2, attr(data_normalized, "scaled:center"), "+")
  
  avatars_tibble_knn5 <- as_tibble(avatars_rescaled) %>% 
    mutate(haplotype = round(haplotype, digits=0),
           cyp3A5D = round(cyp3A5D, digits=0),
           sexe_r  = round(sexe_r , digits=0),
           sexe_d  = round(sexe_d , digits=0),
           rejet_aigu  = round(rejet_aigu , digits=0),
           event = round(event, digits=0)
           # CYP3A4_1B = round(CYP3A4_1B, digits=0),
           # MDR1_C1236T = round(MDR1_C1236T, digits=0),
           # MDR1_G2677T = round(MDR1_G2677T, digits=0),
           # MDR1_C3435T = round(MDR1_C3435T, digits=0)
    ) 
  
  avatars_tibble_factor_knn5 <- as_tibble(avatars_rescaled) %>% 
    mutate(haplotype = round(haplotype, digits=0),
           cyp3A5D = round(cyp3A5D, digits=0),
           sexe_r  = round(sexe_r , digits=0),
           sexe_d  = round(sexe_d , digits=0),
           rejet_aigu  = round(rejet_aigu , digits=0),
           event = round(event, digits=0)
           # CYP3A4_1B = round(CYP3A4_1B, digits=0),
           # MDR1_C1236T = round(MDR1_C1236T, digits=0),
           # MDR1_G2677T = round(MDR1_G2677T, digits=0),
           # MDR1_C3435T = round(MDR1_C3435T, digits=0)
    ) %>% 
    mutate(haplotype = as.factor(haplotype),
           cyp3A5D = as.factor(cyp3A5D),
           sexe_r = as.factor(sexe_r),
           sexe_d = as.factor(sexe_d),
           # CYP3A4_1B = as.factor(CYP3A4_1B),
           # MDR1_C1236T = as.factor(MDR1_C1236T),
           # MDR1_G2677T = as.factor(MDR1_G2677T),
           # MDR1_C3435T = as.factor(MDR1_C3435T),
           rejet_aigu = as.factor(rejet_aigu))

  
  # Finally, fit the Cox model
  fit <- coxph(Surv(delai_event, event) ~ haplotype , 
               data = avatars_tibble_knn5)

  # Calculate confidence intervals
  ci <- confint(fit)
  
  return(list(fit = fit, ci = ci))
}


extract_hrs_and_cis <- function(model_output) {
  coefs <- model_output$fit$coefficients
  ci <- model_output$ci

  hr <- exp(coefs)
  ci_lower <- exp(ci[,"2.5 %"])
  ci_upper <- exp(ci[,"97.5 %"])

  return(data.frame(variable = names(hr), hr = hr, ci_lower = ci_lower, ci_upper = ci_upper))
}

# Generate a list of seed values
seed_values <- sample(x=100) # Modify this if you need different seed values

# Apply the algorithm with different seed values
model_results <- map(seed_values, run_model_with_seed)

# Extract HR and CI from model results
extracted_results <- map(model_results, extract_hrs_and_cis)

# Combine results into a single data frame
combined_results <- bind_rows(extracted_results)

# Calculate median HR and CI for each variable
# aggregate_metrics <- combined_results %>%
#   group_by(variable) %>%
#   summarize(
#     median_hr = median(hr),
#     median_ci_lower = median(ci_lower),
#     median_ci_upper = median(ci_upper)
#   )
# 
# aggregate_metrics

# Calculate the specified percentiles for HRs for each variable
percentile_metrics <- combined_results %>%
  group_by(variable) %>%
  summarize(
    percentile_0 = quantile(hr, probs = 0),
    percentile_5 = quantile(hr, probs = 0.05),
    percentile_25 = quantile(hr, probs = 0.25),
    percentile_50 = quantile(hr, probs = 0.5),
    percentile_75 = quantile(hr, probs = 0.75),
    percentile_95 = quantile(hr, probs = 0.95),
    percentile_100 = quantile(hr, probs = 1)
  ) %>%
  pivot_longer(-variable, names_to = "Percentile_HR", values_to = "Value_HR") %>% 
  mutate(Value_HR = round(Value_HR, 2))
# percentile_metrics
# datatable(percentile_metrics)
knitr::kable(percentile_metrics %>%  mutate(Value_HR = round(Value_HR, 2)), "simple")
variable Percentile_HR Value_HR
haplotype percentile_0 3.20
haplotype percentile_5 4.07
haplotype percentile_25 5.25
haplotype percentile_50 6.17
haplotype percentile_75 7.30
haplotype percentile_95 9.35
haplotype percentile_100 24.91

function to aggregating the HR and CI95 results of Cox models after changing 100 times the seed with different knn

Allow to define the variability range of HR for different Avatar generated with different seed and different values of knn

# # Assuming all necessary functions (PCA transformation, KNN, avatar generation, etc.) are defined
# 
# run_analysis_for_k_and_seed <- function(k, seed_value) {
#  
# 
#  pca_transformed_data <- pca$x
#   knn_result <- get.knn(pca_transformed_data, k)
#   
#   generate_avatar_weights <- function(knn_result, pca_transformed_data, k) {
#     n <- nrow(pca_transformed_data)
#     avatar_weights <- matrix(nrow = n, ncol = k)
#     
#     for (i in 1:n) {
#       # Step 1: Inverse of Distances
#       distances <- sqrt(rowSums((pca_transformed_data[knn_result$nn.index[i, ], ] - pca_transformed_data[i, ])^2))
#       inverse_distances <- 1 / distances
#       
#       # Step 2: Random Weights
# 
#       random_weights <- rexp(k, rate = 1)
#       
#       # Step 3: Contribution Factors
#   
#       shuffled_indices <- sample(k)
#       contribution_factors <- 1 / (2^shuffled_indices)
#       
#       # Step 4: Calculate Weights
#       weights <- inverse_distances * random_weights * contribution_factors
#       
#       # Step 5: Normalize Weights
#       normalized_weights <- weights / sum(weights)
#       
#       avatar_weights[i, ] <- normalized_weights
#     }
#     
#     return(avatar_weights)
#   }
#   
#   
#   
#   # Generate avatar weights
#    set.seed(seed_value)
#   avatar_weights <- generate_avatar_weights(knn_result, pca_transformed_data, k)
#   
#   
#   # Function to generate avatars in PCA space based on weights
#   generate_avatars_pca_space <- function(pca_transformed_data, knn_indices, weights) {
#     n <- nrow(pca_transformed_data)
#     avatars_pca <- matrix(nrow = n, ncol = ncol(pca_transformed_data))
#     
#     for (i in 1:n) {
#       weighted_avatars <- pca_transformed_data[knn_indices[i, ], ] * weights[i, ]
#       avatars_pca[i, ] <- colSums(weighted_avatars)
#     }
#     
#     return(avatars_pca)
#   }
#   # Generate avatars in PCA space
#   avatars_pca_space <- generate_avatars_pca_space(pca_transformed_data, knn_result$nn.index, avatar_weights)
#   
#     # Assuming 'aids_pca' is the PCA object and 'avatars_pca_space' contains the avatars in PCA space
#   # Inverse PCA transformation
#   inverse_pca <- function(pca_object, pca_data) {
#     return(pca_data %*% t(pca_object$rotation) + matrix(pca_object$center, nrow = nrow(pca_data), ncol = ncol(pca_object$rotation), byrow = TRUE))
#   }
#   avatars_original_scale <- inverse_pca(pca, avatars_pca_space)
#   
# 
#   # Inverse normalization (if the original data was normalized)
#   avatars_rescaled <- scale(avatars_original_scale, center = FALSE, scale = 1/attr(data_normalized, "scaled:scale"))
#   avatars_rescaled <- sweep(avatars_rescaled, 2, attr(data_normalized, "scaled:center"), "+")
#   
#   avatars_tibble_knn <- as_tibble(avatars_rescaled) %>% 
#     mutate(haplotype = round(haplotype, digits=0),
#            cyp3A5D = round(cyp3A5D, digits=0),
#            sexe_r  = round(sexe_r , digits=0),
#            sexe_d  = round(sexe_d , digits=0),
#            rejet_aigu  = round(rejet_aigu , digits=0),
#            event = round(event, digits=0)
#            # CYP3A4_1B = round(CYP3A4_1B, digits=0),
#            # MDR1_C1236T = round(MDR1_C1236T, digits=0),
#            # MDR1_G2677T = round(MDR1_G2677T, digits=0),
#            # MDR1_C3435T = round(MDR1_C3435T, digits=0)
#     ) 
#   
#   avatars_tibble_factor_knn <- as_tibble(avatars_rescaled) %>% 
#     mutate(haplotype = round(haplotype, digits=0),
#            cyp3A5D = round(cyp3A5D, digits=0),
#            sexe_r  = round(sexe_r , digits=0),
#            sexe_d  = round(sexe_d , digits=0),
#            rejet_aigu  = round(rejet_aigu , digits=0),
#            event = round(event, digits=0)
#            # CYP3A4_1B = round(CYP3A4_1B, digits=0),
#            # MDR1_C1236T = round(MDR1_C1236T, digits=0),
#            # MDR1_G2677T = round(MDR1_G2677T, digits=0),
#            # MDR1_C3435T = round(MDR1_C3435T, digits=0)
#     ) %>% 
#     mutate(haplotype = as.factor(haplotype),
#            cyp3A5D = as.factor(cyp3A5D),
#            sexe_r = as.factor(sexe_r),
#            sexe_d = as.factor(sexe_d),
#            # CYP3A4_1B = as.factor(CYP3A4_1B),
#            # MDR1_C1236T = as.factor(MDR1_C1236T),
#            # MDR1_G2677T = as.factor(MDR1_G2677T),
#            # MDR1_C3435T = as.factor(MDR1_C3435T),
#            rejet_aigu = as.factor(rejet_aigu))
# 
#   
#   # Finally, fit the Cox model
#   fit <- coxph(Surv(delai_event, event) ~ haplotype , data = avatars_tibble_knn)
# 
# 
#   # Calculate confidence intervals
#   ci <- confint(fit)
# 
#   return(list(fit = fit, ci = ci))
# }
# 
# extract_hrs_and_cis <- function(model_output) {
#   coefs <- model_output$fit$coefficients
#   ci <- model_output$ci
# 
#   hr <- exp(coefs)
#   ci_lower <- exp(ci[,"2.5 %"])
#   ci_upper <- exp(ci[,"97.5 %"])
# 
#   return(data.frame(variable = names(hr), hr = hr, ci_lower = ci_lower, ci_upper = ci_upper))
# }
# 
# run_for_k_values <- function(k) {
#   seed_values <- sample(x = 100)
#   model_results <- map(seed_values, ~run_analysis_for_k_and_seed(k, .x))
#   extracted_results <- map(model_results, extract_hrs_and_cis)
#   combined_results <- bind_rows(extracted_results)
#   
#   aggregate_metrics <- combined_results %>%
#     group_by(variable) %>%
#     summarize(
#       percentile_0 = quantile(hr, probs = 0, na.rm = TRUE),
#       percentile_5 = quantile(hr, probs = 0.05, na.rm = TRUE),
#       percentile_25 = quantile(hr, probs = 0.25, na.rm = TRUE),
#       percentile_50 = quantile(hr, probs = 0.5, na.rm = TRUE),
#       percentile_75 = quantile(hr, probs = 0.75, na.rm = TRUE),
#       percentile_95 = quantile(hr, probs = 0.95, na.rm = TRUE),
#       percentile_100 = quantile(hr, probs = 1, na.rm = TRUE)
#     )
#   
#  
#   
#   return(aggregate_metrics)
# }
# 
# # Define different k values
# k_values <- c(3, 5, 10, 15, 20, 50)
# 
# # Apply the analysis for each k value
# results_for_k_values <- map(k_values, run_for_k_values)
# names(results_for_k_values) <- paste("K =", k_values)
# 
# results_for_k_values
# 
# # datatable(percentile_metrics)
# knitr::kable(results_for_k_values, "simple")

data augmentation avatar

With 5 knn

We investigate the effect of data augmentaiotn with a defined seed and knn=5

data_augment_avatar <- function(x) {
data_normalized <- scale(original1)
pca <- prcomp(data_normalized, scale. = FALSE)# pour selecitonner le nombre de cp rank. = 3
# Number of neighbors
k <- 5  # Adjust this based on your requirement
pca_transformed_data <- pca$x
knn_result <- get.knn(pca_transformed_data, k)

generate_avatar_weights <- function(knn_result, pca_transformed_data, k) {
  n <- nrow(pca_transformed_data)
  avatar_weights <- matrix(nrow = n, ncol = k)
  
  for (i in 1:n) {
    # Step 1: Inverse of Distances
    distances <- sqrt(rowSums((pca_transformed_data[knn_result$nn.index[i, ], ] - pca_transformed_data[i, ])^2))
    inverse_distances <- 1 / distances
    
    # Step 2: Random Weights

    random_weights <- rexp(k, rate = 1)
    
    # Step 3: Contribution Factors
 
    shuffled_indices <- sample(k)
    contribution_factors <- 1 / (2^shuffled_indices)
    
    # Step 4: Calculate Weights
    weights <- inverse_distances * random_weights * contribution_factors
    
    # Step 5: Normalize Weights
    normalized_weights <- weights / sum(weights)
    
    avatar_weights[i, ] <- normalized_weights
  }
  
  return(avatar_weights)
}



# Generate avatar weights
   set.seed( str_c(1,x))
avatar_weights <- generate_avatar_weights(knn_result, pca_transformed_data, k)

# Assuming pca_result, avatar_weights, knn_result$nn.index, and pca_transformed_data are already defined

# Function to generate avatars in PCA space based on weights
generate_avatars_pca_space <- function(pca_transformed_data, knn_indices, weights) {
  n <- nrow(pca_transformed_data)
  avatars_pca <- matrix(nrow = n, ncol = ncol(pca_transformed_data))
  
  for (i in 1:n) {
    weighted_avatars <- pca_transformed_data[knn_indices[i, ], ] * weights[i, ]
    avatars_pca[i, ] <- colSums(weighted_avatars)
  }
  
  return(avatars_pca)
}
# Generate avatars in PCA space
avatars_pca_space <- generate_avatars_pca_space(pca_transformed_data, knn_result$nn.index, avatar_weights)
# Assuming 'aids_pca' is the PCA object and 'avatars_pca_space' contains the avatars in PCA space
# Inverse PCA transformation
inverse_pca <- function(pca_object, pca_data) {
  return(pca_data %*% t(pca_object$rotation) + matrix(pca_object$center, nrow = nrow(pca_data), ncol = ncol(pca_object$rotation), byrow = TRUE))
}
avatars_original_scale <- inverse_pca(pca, avatars_pca_space)

# Assuming 'aids_data_normalized' contains the scaling attributes of the original data
# Inverse normalization (if the original data was normalized)
avatars_rescaled <- scale(avatars_original_scale, center = FALSE, scale = 1/attr(data_normalized, "scaled:scale"))
avatars_rescaled <- sweep(avatars_rescaled, 2, attr(data_normalized, "scaled:center"), "+")

avatars_tibble <- as_tibble(avatars_rescaled) %>% 
  mutate(haplotype = round(haplotype, digits=0),
         cyp3A5D = round(cyp3A5D, digits=0),
         sexe_r  = round(sexe_r , digits=0),
         sexe_d  = round(sexe_d , digits=0),
         rejet_aigu  = round(rejet_aigu , digits=0),
         event = round(event, digits=0)
         # CYP3A4_1B = round(CYP3A4_1B, digits=0),
         # MDR1_C1236T = round(MDR1_C1236T, digits=0),
         # MDR1_G2677T = round(MDR1_G2677T, digits=0),
         # MDR1_C3435T = round(MDR1_C3435T, digits=0)
  ) 
}

iteration <- c(1:4)

augmented_data_5 <- map_dfr(iteration, data_augment_avatar, .id = "iter_")

augmented_data_5 <- augmented_data_5 %>% 
  select(-iter_)

augmented_data_5_factor_knn5 <- augmented_data_5 %>% 
 mutate(haplotype = as.factor(haplotype),
         cyp3A5D = as.factor(cyp3A5D),
         sexe_r = as.factor(sexe_r),
         sexe_d = as.factor(sexe_d),
         # CYP3A4_1B = as.factor(CYP3A4_1B),
         # MDR1_C1236T = as.factor(MDR1_C1236T),
         # MDR1_G2677T = as.factor(MDR1_G2677T),
         # MDR1_C3435T = as.factor(MDR1_C3435T),
         rejet_aigu = as.factor(rejet_aigu))

Plot of the synthetic and original in the latent space

# Combine original and synthetic data for visualization
combined_data <- rbind(
  original1 %>% mutate(DataType = 'Original'),
  augmented_data_5 %>% mutate(DataType = 'Synthetic')
)

# Perform PCA on combined data
combined_data_normalized <- scale(combined_data[, -which(names(combined_data) %in% c("DataType", "id"))])
combined_pca <- prcomp(combined_data_normalized, scale. = FALSE)

# Extract the first two principal components
combined_pca_data <- data.frame(combined_pca$x[, 1:2])
combined_pca_data$DataType <- combined_data$DataType

# Plot PCA with color differentiation
ggplot(combined_pca_data, aes(x = PC1, y = PC2, color = DataType)) +
  geom_point(alpha = 0.5) +
  theme_minimal() +
  labs(title = "PCA Plot", x = "Principal Component 1", y = "Principal Component 2", color = "Data Type")

Export data augmented knn = 5

write_csv(augmented_data_5, file = "avatar_sfpt_knn5_data_augmented.csv")

Comparison of the datasets augmented and original

Summary of the 2 datasets

## Vector of categorical variables that need transformation
catVars <- c("haplotype", "cyp3A5D",  "sexe_r",  "sexe_d", 
"rejet_aigu", "event")
## Create a variable list.
vars <- c( "haplotype", "cyp3A5D", "age_r", "sexe_r", "age_d", "sexe_d", 
"rejet_aigu", "TIF", "event", "delai_event", "DataType")
tableOne <- CreateTableOne(vars = vars, strata = "DataType",factorVars = catVars, data = combined_data)
tableOne2<-print(tableOne, nonnormal = c( "age_r", "age_d", "TIF", "delai_event"), printToggle=F, minMax=T)
Original Synthetic p test
n 253 1012
haplotype (%) <0.001
1 97 (38.3) 373 ( 36.9)
2 123 (48.6) 577 ( 57.0)
3 33 (13.0) 62 ( 6.1)
cyp3A5D = 2 (%) 211 (83.4) 859 ( 84.9) 0.627
age_r (median [range]) 55.00 [19.00, 78.00] 55.42 [23.08, 77.36] 0.722 nonnorm
sexe_r = 2 (%) 156 (61.7) 656 ( 64.8) 0.387
age_d (median [range]) 40.00 [12.00, 73.00] 39.96 [15.04, 68.49] 0.382 nonnorm
sexe_d = 2 (%) 174 (68.8) 717 ( 70.8) 0.569
rejet_aigu = 2 (%) 81 (32.0) 294 ( 29.1) 0.397
TIF (median [range]) 1153.00 [303.00, 2580.00] 1158.71 [456.35, 2362.15] 0.895 nonnorm
event = 1 (%) 22 ( 8.7) 86 ( 8.5) 1.000
delai_event (median [range]) 5.34 [0.68, 15.83] 5.45 [0.97, 14.94] 0.516 nonnorm
DataType = Synthetic (%) 0 ( 0.0) 1012 (100.0) <0.001
summary(original)
##  haplotype   cyp3A5D       age_r       sexe_r      age_d       sexe_d 
##  autre: 97   Es : 42   Min.   :19.00   F: 97   Min.   :12.00   F: 79  
##  het  :123   NEs:211   1st Qu.:44.00   M:156   1st Qu.:25.00   M:174  
##  hom  : 33             Median :55.00           Median :40.00          
##                        Mean   :53.84           Mean   :38.49          
##                        3rd Qu.:64.00           3rd Qu.:49.00          
##                        Max.   :78.00           Max.   :73.00          
##  rejet_aigu      TIF           event          delai_event    
##  0:172      Min.   : 303   Min.   :0.00000   Min.   : 0.680  
##  1: 81      1st Qu.: 975   1st Qu.:0.00000   1st Qu.: 2.920  
##             Median :1153   Median :0.00000   Median : 5.340  
##             Mean   :1199   Mean   :0.08696   Mean   : 6.044  
##             3rd Qu.:1368   3rd Qu.:0.00000   3rd Qu.: 8.700  
##             Max.   :2580   Max.   :1.00000   Max.   :15.830
summary(augmented_data_5)
##    haplotype        cyp3A5D          age_r           sexe_r     
##  Min.   :1.000   Min.   :1.000   Min.   :23.08   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:46.18   1st Qu.:1.000  
##  Median :2.000   Median :2.000   Median :55.42   Median :2.000  
##  Mean   :1.693   Mean   :1.849   Mean   :53.57   Mean   :1.648  
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:62.12   3rd Qu.:2.000  
##  Max.   :3.000   Max.   :2.000   Max.   :77.36   Max.   :2.000  
##      age_d           sexe_d        rejet_aigu         TIF        
##  Min.   :15.04   Min.   :1.000   Min.   :1.000   Min.   : 456.3  
##  1st Qu.:30.19   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1045.9  
##  Median :39.96   Median :2.000   Median :1.000   Median :1158.7  
##  Mean   :39.29   Mean   :1.708   Mean   :1.291   Mean   :1174.6  
##  3rd Qu.:47.90   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:1278.0  
##  Max.   :68.49   Max.   :2.000   Max.   :2.000   Max.   :2362.1  
##      event          delai_event     
##  Min.   :0.00000   Min.   : 0.9673  
##  1st Qu.:0.00000   1st Qu.: 3.8059  
##  Median :0.00000   Median : 5.4479  
##  Mean   :0.08498   Mean   : 5.8418  
##  3rd Qu.:0.00000   3rd Qu.: 7.3332  
##  Max.   :1.00000   Max.   :14.9361

individual data explorer

# boxplots
plot_boxplot(combined_data , by ="DataType") 

# histograms

# Function to create histogram for each continuous variable
plot_histograms <- function(data, var_name, group_var) {
  ggplot(data, aes(x = !!sym(var_name), fill = !!sym(group_var))) +
    geom_histogram(alpha = 0.5,show.legend = FALSE) +
    labs(x = var_name, y = "Count") +
    theme_minimal() +
    ggtitle(paste(var_name))
}

# Using select_if to identify continuous variables and map to apply the function
plots <- combined_data %>%
  select( -sexe_r,-sexe_d) %>% 
  select_if(is.numeric) %>%
  names() %>%
  map(~plot_histograms(combined_data, ., "DataType"))

# Optionally, print or arrange plots (e.g., using gridExtra or patchwork packages)

wrap_plots(plots)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

plot correlation

##Correlation Analysis
  cor_real <- cor(original1, use = "complete.obs")
  cor_synthetic <- cor(augmented_data_5, use = "complete.obs")
  
# plots
ggcorrplot(cor_real, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

# plots
ggcorrplot(cor_synthetic, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

Modele de Cox

# original
fit_original <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = original1)
## 
##   n= 253, number of events= 22 
## 
##                  coef  exp(coef)   se(coef)      z Pr(>|z|)    
## haplotype   1.2112071  3.3575352  0.3463273  3.497  0.00047 ***
## cyp3A5D    -1.2323909  0.2915946  0.5567303 -2.214  0.02685 *  
## age_r      -0.0039521  0.9960557  0.0187880 -0.210  0.83339    
## sexe_r     -0.0438849  0.9570641  0.4606422 -0.095  0.92410    
## age_d       0.0360206  1.0366772  0.0203668  1.769  0.07696 .  
## sexe_d      0.2786636  1.3213627  0.5181402  0.538  0.59070    
## rejet_aigu  1.0124644  2.7523756  0.4804379  2.107  0.03508 *  
## TIF        -0.0002268  0.9997732  0.0005753 -0.394  0.69345    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     3.3575     0.2978   1.70305    6.6193
## cyp3A5D       0.2916     3.4294   0.09792    0.8683
## age_r         0.9961     1.0040   0.96004    1.0334
## sexe_r        0.9571     1.0449   0.38801    2.3607
## age_d         1.0367     0.9646   0.99611    1.0789
## sexe_d        1.3214     0.7568   0.47861    3.6481
## rejet_aigu    2.7524     0.3633   1.07339    7.0576
## TIF           0.9998     1.0002   0.99865    1.0009
## 
## Concordance= 0.758  (se = 0.054 )
## Likelihood ratio test= 24.6  on 8 df,   p=0.002
## Wald test            = 21.09  on 8 df,   p=0.007
## Score (logrank) test = 25.84  on 8 df,   p=0.001
ggforest(fit_original)

# synthetique
fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF , data = augmented_data_5)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = augmented_data_5)
## 
##   n= 1012, number of events= 86 
## 
##                  coef  exp(coef)   se(coef)      z Pr(>|z|)    
## haplotype   1.8218542  6.1833132  0.2007713  9.074  < 2e-16 ***
## cyp3A5D    -1.0012032  0.3674371  0.3486473 -2.872  0.00408 ** 
## age_r       0.0442697  1.0452643  0.0135693  3.263  0.00110 ** 
## sexe_r      0.2190860  1.2449383  0.2474680  0.885  0.37599    
## age_d       0.0392892  1.0400712  0.0146711  2.678  0.00741 ** 
## sexe_d      2.2586712  9.5703635  0.3732126  6.052 1.43e-09 ***
## rejet_aigu  1.1536414  3.1697141  0.2689862  4.289 1.80e-05 ***
## TIF        -0.0009711  0.9990294  0.0005792 -1.677  0.09362 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     6.1833     0.1617    4.1718    9.1647
## cyp3A5D       0.3674     2.7216    0.1855    0.7277
## age_r         1.0453     0.9567    1.0178    1.0734
## sexe_r        1.2449     0.8033    0.7665    2.0221
## age_d         1.0401     0.9615    1.0106    1.0704
## sexe_d        9.5704     0.1045    4.6052   19.8888
## rejet_aigu    3.1697     0.3155    1.8709    5.3701
## TIF           0.9990     1.0010    0.9979    1.0002
## 
## Concordance= 0.854  (se = 0.017 )
## Likelihood ratio test= 156.1  on 8 df,   p=<2e-16
## Wald test            = 136.3  on 8 df,   p=<2e-16
## Score (logrank) test = 167.8  on 8 df,   p=<2e-16
ggforest(fit_synthetique)

BootstepAIC augmented synthetic knn5

boot.stepAIC(fit_synthetique, augmented_data_5, B = 100, k=log(nrow(augmented_data_5)))
## 
## Summary of Bootstrapping the 'stepAIC()' procedure for
## 
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = augmented_data_5)
## 
## Bootstrap samples: 100 
## Direction: backward 
## Penalty: 6.92 * df
## 
## Covariates selected
##            (%)
## haplotype  100
## sexe_d     100
## rejet_aigu  85
## age_r       67
## age_d       59
## cyp3A5D     36
## TIF         15
## sexe_r       7
## 
## Coefficients Sign
##            + (%) - (%)
## age_d        100     0
## age_r        100     0
## haplotype    100     0
## rejet_aigu   100     0
## sexe_d       100     0
## sexe_r       100     0
## cyp3A5D        0   100
## TIF            0   100
## 
## Stat Significance
##            (%)
## age_d      100
## age_r      100
## cyp3A5D    100
## haplotype  100
## rejet_aigu 100
## sexe_d     100
## sexe_r     100
## TIF        100
## 
## 
## The stepAIC() for the original data-set gave
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + age_r + 
##     sexe_d + rejet_aigu, data = augmented_data_5)
## 
##               coef exp(coef) se(coef)     z        p
## haplotype  1.86461   6.45345  0.19723 9.454  < 2e-16
## age_r      0.03991   1.04072  0.01287 3.100  0.00193
## sexe_d     1.73449   5.66604  0.33156 5.231 1.68e-07
## rejet_aigu 1.18776   3.27974  0.26848 4.424 9.69e-06
## 
## Likelihood ratio test=139.8  on 4 df, p=< 2.2e-16
## n= 1012, number of events= 86 
## 
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Surv(delai_event, event) ~ haplotype + cyp3A5D + age_r + sexe_r + 
##     age_d + sexe_d + rejet_aigu + TIF
## 
## Final Model:
## Surv(delai_event, event) ~ haplotype + age_r + sexe_d + rejet_aigu
## 
## 
##        Step Df  Deviance Resid. Df Resid. Dev      AIC
## 1                               78  -156.1183 885.8237
## 2  - sexe_r  1 0.7927363        79  -155.3256 879.6968
## 3     - TIF  1 3.1531373        80  -152.1724 875.9302
## 4 - cyp3A5D  1 5.9361800        81  -146.2362 874.9467
## 5   - age_d  1 6.4243828        82  -139.8119 874.4514

Final model original

fit_original <- coxph(Surv(delai_event, event) ~ haplotype + rejet_aigu , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + rejet_aigu, 
##     data = original1)
## 
##   n= 253, number of events= 22 
## 
##              coef exp(coef) se(coef)     z Pr(>|z|)    
## haplotype  1.1681    3.2160   0.3261 3.582 0.000341 ***
## rejet_aigu 0.9238    2.5188   0.4661 1.982 0.047482 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype      3.216     0.3109     1.697     6.094
## rejet_aigu     2.519     0.3970     1.010     6.280
## 
## Concordance= 0.732  (se = 0.05 )
## Likelihood ratio test= 18.24  on 2 df,   p=1e-04
## Wald test            = 17.44  on 2 df,   p=2e-04
## Score (logrank) test = 19.29  on 2 df,   p=6e-05

Final model synthetic

fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype + age_r + sexe_d + rejet_aigu , data = augmented_data_5)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + age_r + 
##     sexe_d + rejet_aigu, data = augmented_data_5)
## 
##   n= 1012, number of events= 86 
## 
##               coef exp(coef) se(coef)     z Pr(>|z|)    
## haplotype  1.86461   6.45345  0.19723 9.454  < 2e-16 ***
## age_r      0.03991   1.04072  0.01287 3.100  0.00193 ** 
## sexe_d     1.73449   5.66604  0.33156 5.231 1.68e-07 ***
## rejet_aigu 1.18776   3.27974  0.26848 4.424 9.69e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype      6.453     0.1550     4.384     9.499
## age_r          1.041     0.9609     1.015     1.067
## sexe_d         5.666     0.1765     2.958    10.852
## rejet_aigu     3.280     0.3049     1.938     5.551
## 
## Concordance= 0.849  (se = 0.019 )
## Likelihood ratio test= 139.8  on 4 df,   p=<2e-16
## Wald test            = 129.7  on 4 df,   p=<2e-16
## Score (logrank) test = 152.8  on 4 df,   p=<2e-16

Modele final & KM

km_original <- survfit(Surv(delai_event, event) ~ haplotype, data = original)
ggsurvplot(
  km_original,
  data = original,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)

km_synthetique <- survfit(Surv(delai_event, event) ~ haplotype, data = augmented_data_5_factor_knn5)
km_synthetique_avatar_5_augmented <- ggsurvplot(
  km_synthetique,
  data = augmented_data_5_factor_knn5,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)
km_synthetique_avatar_5_augmented

Plots original & synthetic combined

## combine data
combined_df <- rbind(original %>% mutate(group = "original"), augmented_data_5_factor_knn5 %>% mutate(group = "synthetic")) %>% mutate(combined_haplotype = str_c(haplotype,"_", group ))

## fit the model
km_combined <- survfit(Surv(delai_event, event) ~ combined_haplotype, data = combined_df)

# plot
ggsurvplot(fit = km_combined, 
           data = combined_df,
           
  size = 1,                 # change line size
  conf.int = FALSE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.35, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)

Graphical exploraiotn of distribution

library(GGally)

pm_knn5_augmented <- combined_df %>% select(haplotype:delai_event, group) %>% ggpairs(
  ggplot2::aes(colour = group,alpha = 0.5),
  upper = list(continuous = wrap("cor", size = 1.5)),
  lower=list(combo=wrap("facethist", binwidth=0.5))) + 
  theme(strip.text.x = element_text(size = 5),
           strip.text.y = element_text(size = 5),axis.text = element_text(size = 5))
pm_knn5_augmented

# ggsave("comparaison_distribution_augmented_knn5.pdf")

Bootstrap of the coefficient for haplotype

Allow to define the variability range of HR for a given dataset (intra dataset variability)

# Define the Cox model
cox_model <- function(data, indices) {
  d <- data[indices,] # allows bootstrapping to sample the data
  fit <- coxph(Surv(delai_event, event) ~ haplotype + age_r + sexe_d + rejet_aigu , data=d)
  return(fit$coefficients)
}

# Set the seed for reproducibility
set.seed(12)

# Bootstrap the Cox model
boot_results <- boot(data=augmented_data_5, statistic=cox_model, R=100)

# Convert bootstrap results to a data frame for ggplot2
boot_hrs <- exp(boot_results$t) # Convert log(HR) to HR
hr_data_haplo <- data.frame(HR=boot_hrs[,1])
hr_data_age_r <- data.frame(HR=boot_hrs[,2])
hr_data_sexe_d <- data.frame(HR=boot_hrs[,3])
hr_data_rejet_aigu <- data.frame(HR=boot_hrs[,4])
# Calculate summary statistics
summary_stats <- quantile(hr_data_haplo$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) %>%  
  bind_rows(quantile(hr_data_age_r$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) ) %>% 
  bind_rows(quantile(hr_data_sexe_d$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) ) %>% 
  bind_rows(quantile(hr_data_rejet_aigu$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) )  
names(summary_stats) <- c("Min","2.5th", "5th", "25th", "Median", "75th", "95th","97.5th","Max")

# Create the histogram
ggplot(hr_data_haplo, aes(x=HR)) +
  geom_histogram(bins=30, fill="#007a86", color="black") +
  # # geom_vline(aes(xintercept=summary_stats["Min"]), color="red", linetype="dashed") +
  # geom_vline(aes(xintercept=summary_stats["25th"][[1]][[1]]), color="gray", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["Median"][[1]][[1]]), color="blue", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["75th"])[[1]][[1]], color="gray", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["Max"]), color="purple", linetype="dashed") +
  labs(title="Bootstrap Distribution of Hazard Ratios", x="Hazard Ratio (HR)", y="Frequency") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

# Print summary statistics
knitr::kable(summary_stats, "simple")
Min 2.5th 5th 25th Median 75th 95th 97.5th Max
4.519891 5.175495 5.293237 5.976295 6.677334 7.692327 8.682031 8.968824 9.451237
1.012048 1.022511 1.024808 1.035579 1.039754 1.046965 1.057784 1.063154 1.068696
2.167422 3.325784 3.557883 5.048900 6.003292 7.209618 8.929283 11.123687 20.182587
1.533517 1.826242 1.947511 2.620763 3.133227 3.898546 5.604837 6.093282 7.779243

Allow to define the inter variability range of HR for augmented knn=5 (inter dataset variability) by using 100 bootraps

# Assuming all your existing functions and necessary libraries are loaded

run_model_with_seed <- function(seed_value) {
  
  
  # augmentaiotn of data
  
  data_augment_avatar <- function(x) {
    data_normalized <- scale(original1)
    pca <- prcomp(data_normalized, scale. = FALSE)# pour selecitonner le nombre de cp rank. = 3
    # Number of neighbors
    k <- 5 # Adjust this based on your requirement
    pca_transformed_data <- pca$x
    knn_result <- get.knn(pca_transformed_data, k)
    
    generate_avatar_weights <- function(knn_result, pca_transformed_data, k) {
      n <- nrow(pca_transformed_data)
      avatar_weights <- matrix(nrow = n, ncol = k)
      
      for (i in 1:n) {
        # Step 1: Inverse of Distances
        distances <- sqrt(rowSums((pca_transformed_data[knn_result$nn.index[i, ], ] - pca_transformed_data[i, ])^2))
        inverse_distances <- 1 / distances
        
        # Step 2: Random Weights
        
        random_weights <- rexp(k, rate = 1)
        
        # Step 3: Contribution Factors
        
        shuffled_indices <- sample(k)
        contribution_factors <- 1 / (2^shuffled_indices)
        
        # Step 4: Calculate Weights
        weights <- inverse_distances * random_weights * contribution_factors
        
        # Step 5: Normalize Weights
        normalized_weights <- weights / sum(weights)
        
        avatar_weights[i, ] <- normalized_weights
      }
      
      return(avatar_weights)
    }
    
    
    
    # Generate avatar weights
 
    avatar_weights <- generate_avatar_weights(knn_result, pca_transformed_data, k)
    
    # Assuming pca_result, avatar_weights, knn_result$nn.index, and pca_transformed_data are already defined
    
    # Function to generate avatars in PCA space based on weights
    generate_avatars_pca_space <- function(pca_transformed_data, knn_indices, weights) {
      n <- nrow(pca_transformed_data)
      avatars_pca <- matrix(nrow = n, ncol = ncol(pca_transformed_data))
      
      for (i in 1:n) {
        weighted_avatars <- pca_transformed_data[knn_indices[i, ], ] * weights[i, ]
        avatars_pca[i, ] <- colSums(weighted_avatars)
      }
      
      return(avatars_pca)
    }
    # Generate avatars in PCA space
    avatars_pca_space <- generate_avatars_pca_space(pca_transformed_data, knn_result$nn.index, avatar_weights)
    # Assuming 'aids_pca' is the PCA object and 'avatars_pca_space' contains the avatars in PCA space
    # Inverse PCA transformation
    inverse_pca <- function(pca_object, pca_data) {
      return(pca_data %*% t(pca_object$rotation) + matrix(pca_object$center, nrow = nrow(pca_data), ncol = ncol(pca_object$rotation), byrow = TRUE))
    }
    avatars_original_scale <- inverse_pca(pca, avatars_pca_space)
    
    # Assuming 'aids_data_normalized' contains the scaling attributes of the original data
    # Inverse normalization (if the original data was normalized)
    avatars_rescaled <- scale(avatars_original_scale, center = FALSE, scale = 1/attr(data_normalized, "scaled:scale"))
    avatars_rescaled <- sweep(avatars_rescaled, 2, attr(data_normalized, "scaled:center"), "+")
    
    avatars_tibble <- as_tibble(avatars_rescaled) %>% 
      mutate(haplotype = round(haplotype, digits=0),
             cyp3A5D = round(cyp3A5D, digits=0),
             sexe_r  = round(sexe_r , digits=0),
             sexe_d  = round(sexe_d , digits=0),
             rejet_aigu  = round(rejet_aigu , digits=0),
             event = round(event, digits=0)
             # CYP3A4_1B = round(CYP3A4_1B, digits=0),
             # MDR1_C1236T = round(MDR1_C1236T, digits=0),
             # MDR1_G2677T = round(MDR1_G2677T, digits=0),
             # MDR1_C3435T = round(MDR1_C3435T, digits=0)
      ) 
  }
  
  iteration <- c(1:4)
  set.seed(seed_value)
  augmented_data_x <- map_dfr(iteration, data_augment_avatar, .id = "iter_")
  
  augmented_data_x <- augmented_data_x %>% 
    select(-iter_)
  
  
  ###############
  # Finally, fit the Cox model
  fit <- coxph(Surv(delai_event, event) ~ haplotype +age_r + sexe_d + rejet_aigu, 
               data = augmented_data_x)
  coefs <- fit$coefficients
  hr <- exp(coefs)
  return(data.frame(variable = names(hr), hr = hr))
  # Calculate confidence intervals
#  ci <- confint(fit)
  
 # return(list(fit = fit, ci = ci))
}




# Generate a list of seed values
seed_value <- sample(x=100) # Modify this if you need different seed values

# Apply the algorithm with different seed values
model_results <- map(seed_value, run_model_with_seed)

# Extract HR and CI from model results
#extracted_results <- map(model_results, extract_hrs_and_cis)

# Combine results into a single data frame
combined_results <- bind_rows(model_results)

# Calculate median HR and CI for each variable
# aggregate_metrics <- combined_results %>%
#   group_by(variable) %>%
#   summarize(
#     median_hr = median(hr),
#     median_ci_lower = median(ci_lower),
#     median_ci_upper = median(ci_upper)
#   )
# 
# aggregate_metrics

# Calculate the specified percentiles for HRs for each variable
percentile_metrics <- combined_results %>%
  group_by(variable) %>%
  summarize(
    percentile_0 = quantile(hr, probs = 0),
    percentile_5 = quantile(hr, probs = 0.05),
    percentile_25 = quantile(hr, probs = 0.25),
    percentile_50 = quantile(hr, probs = 0.5),
    percentile_75 = quantile(hr, probs = 0.75),
    percentile_95 = quantile(hr, probs = 0.95),
    percentile_100 = quantile(hr, probs = 1)
  ) %>%
  pivot_longer(-variable, names_to = "Percentile_HR", values_to = "Value_HR") %>% 
  mutate(Value_HR = round(Value_HR, 2))
# percentile_metrics
# datatable(percentile_metrics)
knitr::kable(percentile_metrics %>%  mutate(Value_HR = round(Value_HR, 2)), "simple")
variable Percentile_HR Value_HR
age_r percentile_0 1.02
age_r percentile_5 1.03
age_r percentile_25 1.03
age_r percentile_50 1.04
age_r percentile_75 1.04
age_r percentile_95 1.05
age_r percentile_100 1.05
haplotype percentile_0 4.04
haplotype percentile_5 4.44
haplotype percentile_25 5.08
haplotype percentile_50 5.53
haplotype percentile_75 6.06
haplotype percentile_95 7.27
haplotype percentile_100 8.37
rejet_aigu percentile_0 2.43
rejet_aigu percentile_5 3.05
rejet_aigu percentile_25 3.56
rejet_aigu percentile_50 4.17
rejet_aigu percentile_75 4.71
rejet_aigu percentile_95 6.13
rejet_aigu percentile_100 7.58
sexe_d percentile_0 1.29
sexe_d percentile_5 1.90
sexe_d percentile_25 2.49
sexe_d percentile_50 2.95
sexe_d percentile_75 3.49
sexe_d percentile_95 4.35
sexe_d percentile_100 8.81

KNN=20

# Number of neighbors
k <- 20  # Adjust this based on your requirement

algorithm

pca_transformed_data <- pca$x
knn_result <- get.knn(pca_transformed_data, k)

generate_avatar_weights <- function(knn_result, pca_transformed_data, k) {
  n <- nrow(pca_transformed_data)
  avatar_weights <- matrix(nrow = n, ncol = k)
  
  for (i in 1:n) {
    # Step 1: Inverse of Distances
    distances <- sqrt(rowSums((pca_transformed_data[knn_result$nn.index[i, ], ] - pca_transformed_data[i, ])^2))
    inverse_distances <- 1 / distances
    
    # Step 2: Random Weights
   
    random_weights <- rexp(k, rate = 1)
    
    # Step 3: Contribution Factors
   
    shuffled_indices <- sample(k)
    contribution_factors <- 1 / (2^shuffled_indices)
    
    # Step 4: Calculate Weights
    weights <- inverse_distances * random_weights * contribution_factors
    
    # Step 5: Normalize Weights
    normalized_weights <- weights / sum(weights)
    
    avatar_weights[i, ] <- normalized_weights
  }
  
  return(avatar_weights)
}



# Generate avatar weights
 set.seed(12)
avatar_weights <- generate_avatar_weights(knn_result, pca_transformed_data, k)

Generation of avatar in the latent space

# Assuming pca_result, avatar_weights, knn_result$nn.index, and pca_transformed_data are already defined

# Function to generate avatars in PCA space based on weights
generate_avatars_pca_space <- function(pca_transformed_data, knn_indices, weights) {
  n <- nrow(pca_transformed_data)
  avatars_pca <- matrix(nrow = n, ncol = ncol(pca_transformed_data))
  
  for (i in 1:n) {
    weighted_avatars <- pca_transformed_data[knn_indices[i, ], ] * weights[i, ]
    avatars_pca[i, ] <- colSums(weighted_avatars)
  }
  
  return(avatars_pca)
}
# Generate avatars in PCA space
avatars_pca_space <- generate_avatars_pca_space(pca_transformed_data, knn_result$nn.index, avatar_weights)

Return to the initial scale

# Assuming 'aids_pca' is the PCA object and 'avatars_pca_space' contains the avatars in PCA space
# Inverse PCA transformation
inverse_pca <- function(pca_object, pca_data) {
  return(pca_data %*% t(pca_object$rotation) + matrix(pca_object$center, nrow = nrow(pca_data), ncol = ncol(pca_object$rotation), byrow = TRUE))
}
avatars_original_scale <- inverse_pca(pca, avatars_pca_space)

# Assuming 'aids_data_normalized' contains the scaling attributes of the original data
# Inverse normalization (if the original data was normalized)
avatars_rescaled <- scale(avatars_original_scale, center = FALSE, scale = 1/attr(data_normalized, "scaled:scale"))
avatars_rescaled <- sweep(avatars_rescaled, 2, attr(data_normalized, "scaled:center"), "+")

Transform into tibble

avatars_tibble_knn20 <- as_tibble(avatars_rescaled) %>% 
  mutate(haplotype = round(haplotype, digits=0),
         cyp3A5D = round(cyp3A5D, digits=0),
         sexe_r  = round(sexe_r , digits=0),
         sexe_d  = round(sexe_d , digits=0),
         rejet_aigu  = round(rejet_aigu , digits=0),
         event = round(event, digits=0)
         # CYP3A4_1B = round(CYP3A4_1B, digits=0),
         # MDR1_C1236T = round(MDR1_C1236T, digits=0),
         # MDR1_G2677T = round(MDR1_G2677T, digits=0),
         # MDR1_C3435T = round(MDR1_C3435T, digits=0)
  ) 

avatars_tibble_factor_knn20 <- as_tibble(avatars_rescaled) %>% 
  mutate(haplotype = round(haplotype, digits=0),
         cyp3A5D = round(cyp3A5D, digits=0),
         sexe_r  = round(sexe_r , digits=0),
         sexe_d  = round(sexe_d , digits=0),
         rejet_aigu  = round(rejet_aigu , digits=0),
         event = round(event, digits=0)
         # CYP3A4_1B = round(CYP3A4_1B, digits=0),
         # MDR1_C1236T = round(MDR1_C1236T, digits=0),
         # MDR1_G2677T = round(MDR1_G2677T, digits=0),
         # MDR1_C3435T = round(MDR1_C3435T, digits=0)
  ) %>% 
  mutate(haplotype = as.factor(haplotype),
         cyp3A5D = as.factor(cyp3A5D),
         sexe_r = as.factor(sexe_r),
         sexe_d = as.factor(sexe_d),
         # CYP3A4_1B = as.factor(CYP3A4_1B),
         # MDR1_C1236T = as.factor(MDR1_C1236T),
         # MDR1_G2677T = as.factor(MDR1_G2677T),
         # MDR1_C3435T = as.factor(MDR1_C3435T),
         rejet_aigu = as.factor(rejet_aigu))

Plot of the synthetic and original in the latent space

# Combine original and synthetic data for visualization
combined_data <- rbind(
  original1 %>% mutate(DataType = 'Original'),
  avatars_tibble_knn20 %>% mutate(DataType = 'Synthetic')
)

# Perform PCA on combined data
combined_data_normalized <- scale(combined_data[, -which(names(combined_data) %in% c("DataType", "id"))])
combined_pca <- prcomp(combined_data_normalized, scale. = FALSE)

# Extract the first two principal components
combined_pca_data <- data.frame(combined_pca$x[, 1:2])
combined_pca_data$DataType <- combined_data$DataType

# Plot PCA with color differentiation
ggplot(combined_pca_data, aes(x = PC1, y = PC2, color = DataType)) +
  geom_point(alpha = 0.8) +
  theme_minimal() +
  labs(title = "PCA Plot", x = "Principal Component 1", y = "Principal Component 2", color = "Data Type")

Comparison of the datasets

Summary of the 2 datasets

## Vector of categorical variables that need transformation
catVars <- c("haplotype", "cyp3A5D",  "sexe_r",  "sexe_d", 
"rejet_aigu", "event")
## Create a variable list.
vars <- c( "haplotype", "cyp3A5D", "age_r", "sexe_r", "age_d", "sexe_d", 
"rejet_aigu", "TIF", "event", "delai_event", "DataType")
tableOne <- CreateTableOne(vars = vars, strata = "DataType",factorVars = catVars, data = combined_data)
tableOne2<-print(tableOne, nonnormal = c( "age_r", "age_d", "TIF", "delai_event"), printToggle=F, minMax=T)
Original Synthetic p test
n 253 253
haplotype (%) 0.001
1 97 (38.3) 79 ( 31.2)
2 123 (48.6) 159 ( 62.8)
3 33 (13.0) 15 ( 5.9)
cyp3A5D = 2 (%) 211 (83.4) 224 ( 88.5) 0.125
age_r (median [range]) 55.00 [19.00, 78.00] 55.12 [25.46, 74.57] 0.715 nonnorm
sexe_r = 2 (%) 156 (61.7) 178 ( 70.4) 0.049
age_d (median [range]) 40.00 [12.00, 73.00] 39.04 [18.38, 67.75] 0.643 nonnorm
sexe_d = 2 (%) 174 (68.8) 188 ( 74.3) 0.200
rejet_aigu = 2 (%) 81 (32.0) 63 ( 24.9) 0.094
TIF (median [range]) 1153.00 [303.00, 2580.00] 1135.02 [630.81, 2091.07] 0.271 nonnorm
event = 1 (%) 22 ( 8.7) 13 ( 5.1) 0.161
delai_event (median [range]) 5.34 [0.68, 15.83] 5.34 [1.20, 15.10] 0.779 nonnorm
DataType = Synthetic (%) 0 ( 0.0) 253 (100.0) <0.001
summary(original)
##  haplotype   cyp3A5D       age_r       sexe_r      age_d       sexe_d 
##  autre: 97   Es : 42   Min.   :19.00   F: 97   Min.   :12.00   F: 79  
##  het  :123   NEs:211   1st Qu.:44.00   M:156   1st Qu.:25.00   M:174  
##  hom  : 33             Median :55.00           Median :40.00          
##                        Mean   :53.84           Mean   :38.49          
##                        3rd Qu.:64.00           3rd Qu.:49.00          
##                        Max.   :78.00           Max.   :73.00          
##  rejet_aigu      TIF           event          delai_event    
##  0:172      Min.   : 303   Min.   :0.00000   Min.   : 0.680  
##  1: 81      1st Qu.: 975   1st Qu.:0.00000   1st Qu.: 2.920  
##             Median :1153   Median :0.00000   Median : 5.340  
##             Mean   :1199   Mean   :0.08696   Mean   : 6.044  
##             3rd Qu.:1368   3rd Qu.:0.00000   3rd Qu.: 8.700  
##             Max.   :2580   Max.   :1.00000   Max.   :15.830
summary(avatars_tibble_knn20)
##    haplotype        cyp3A5D          age_r           sexe_r     
##  Min.   :1.000   Min.   :1.000   Min.   :25.46   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:46.65   1st Qu.:1.000  
##  Median :2.000   Median :2.000   Median :55.12   Median :2.000  
##  Mean   :1.747   Mean   :1.885   Mean   :53.70   Mean   :1.704  
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:61.25   3rd Qu.:2.000  
##  Max.   :3.000   Max.   :2.000   Max.   :74.57   Max.   :2.000  
##      age_d           sexe_d        rejet_aigu         TIF        
##  Min.   :18.38   Min.   :1.000   Min.   :1.000   Min.   : 630.8  
##  1st Qu.:30.96   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1027.6  
##  Median :39.04   Median :2.000   Median :1.000   Median :1135.0  
##  Mean   :39.03   Mean   :1.743   Mean   :1.249   Mean   :1148.5  
##  3rd Qu.:46.51   3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:1253.4  
##  Max.   :67.75   Max.   :2.000   Max.   :2.000   Max.   :2091.1  
##      event          delai_event    
##  Min.   :0.00000   Min.   : 1.204  
##  1st Qu.:0.00000   1st Qu.: 3.946  
##  Median :0.00000   Median : 5.344  
##  Mean   :0.05138   Mean   : 5.742  
##  3rd Qu.:0.00000   3rd Qu.: 7.278  
##  Max.   :1.00000   Max.   :15.099

individual data explorer

# boxplots
plot_boxplot(combined_data , by ="DataType") 

# histograms

# Function to create histogram for each continuous variable
plot_histograms <- function(data, var_name, group_var) {
  ggplot(data, aes(x = !!sym(var_name), fill = !!sym(group_var))) +
    geom_histogram(alpha = 0.5,show.legend = FALSE) +
    labs(x = var_name, y = "Count") +
    theme_minimal() +
    ggtitle(paste(var_name))
}

# Using select_if to identify continuous variables and map to apply the function
plots <- combined_data %>%
  select( -sexe_r,-sexe_d) %>% 
  select_if(is.numeric) %>%
  names() %>%
  map(~plot_histograms(combined_data, ., "DataType"))

# Optionally, print or arrange plots (e.g., using gridExtra or patchwork packages)

wrap_plots(plots)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

plot correlation

##Correlation Analysis
  cor_real <- cor(original1, use = "complete.obs")
  cor_synthetic <- cor(avatars_tibble_knn20, use = "complete.obs")
  
# plots
ggcorrplot(cor_real, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

# plots
ggcorrplot(cor_synthetic, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

Modele de Cox

# original
fit_original <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = original1)
## 
##   n= 253, number of events= 22 
## 
##                  coef  exp(coef)   se(coef)      z Pr(>|z|)    
## haplotype   1.2112071  3.3575352  0.3463273  3.497  0.00047 ***
## cyp3A5D    -1.2323909  0.2915946  0.5567303 -2.214  0.02685 *  
## age_r      -0.0039521  0.9960557  0.0187880 -0.210  0.83339    
## sexe_r     -0.0438849  0.9570641  0.4606422 -0.095  0.92410    
## age_d       0.0360206  1.0366772  0.0203668  1.769  0.07696 .  
## sexe_d      0.2786636  1.3213627  0.5181402  0.538  0.59070    
## rejet_aigu  1.0124644  2.7523756  0.4804379  2.107  0.03508 *  
## TIF        -0.0002268  0.9997732  0.0005753 -0.394  0.69345    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     3.3575     0.2978   1.70305    6.6193
## cyp3A5D       0.2916     3.4294   0.09792    0.8683
## age_r         0.9961     1.0040   0.96004    1.0334
## sexe_r        0.9571     1.0449   0.38801    2.3607
## age_d         1.0367     0.9646   0.99611    1.0789
## sexe_d        1.3214     0.7568   0.47861    3.6481
## rejet_aigu    2.7524     0.3633   1.07339    7.0576
## TIF           0.9998     1.0002   0.99865    1.0009
## 
## Concordance= 0.758  (se = 0.054 )
## Likelihood ratio test= 24.6  on 8 df,   p=0.002
## Wald test            = 21.09  on 8 df,   p=0.007
## Score (logrank) test = 25.84  on 8 df,   p=0.001
ggforest(fit_original)

# synthetique
fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF, data = avatars_tibble_knn20)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = avatars_tibble_knn20)
## 
##   n= 253, number of events= 13 
## 
##                 coef exp(coef)  se(coef)      z Pr(>|z|)  
## haplotype   2.179268  8.839834  0.955361  2.281   0.0225 *
## cyp3A5D    -1.619692  0.197960  0.839800 -1.929   0.0538 .
## age_r      -0.004690  0.995321  0.036846 -0.127   0.8987  
## sexe_r     -0.465605  0.627755  0.710697 -0.655   0.5124  
## age_d       0.053252  1.054696  0.046420  1.147   0.2513  
## sexe_d     -0.372408  0.689073  0.635743 -0.586   0.5580  
## rejet_aigu  1.140204  3.127407  0.641104  1.779   0.0753 .
## TIF        -0.003864  0.996143  0.001955 -1.977   0.0480 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     8.8398     0.1131   1.35906    57.497
## cyp3A5D       0.1980     5.0515   0.03817     1.027
## age_r         0.9953     1.0047   0.92598     1.070
## sexe_r        0.6278     1.5930   0.15590     2.528
## age_d         1.0547     0.9481   0.96297     1.155
## sexe_d        0.6891     1.4512   0.19821     2.396
## rejet_aigu    3.1274     0.3198   0.89018    10.987
## TIF           0.9961     1.0039   0.99233     1.000
## 
## Concordance= 0.823  (se = 0.055 )
## Likelihood ratio test= 21.87  on 8 df,   p=0.005
## Wald test            = 16.79  on 8 df,   p=0.03
## Score (logrank) test = 21.29  on 8 df,   p=0.006
ggforest(fit_synthetique)

BootstepAIC based on BIC

Allow to see which vairable would have been selected

Original

boot.stepAIC(fit_original, original1, B = 100, k=log(nrow(original1)))
## 
## Summary of Bootstrapping the 'stepAIC()' procedure for
## 
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = original1)
## 
## Bootstrap samples: 100 
## Direction: backward 
## Penalty: 5.53 * df
## 
## Covariates selected
##            (%)
## haplotype   95
## rejet_aigu  47
## cyp3A5D     29
## age_d       21
## sexe_d       3
## Null         2
## sexe_r       2
## TIF          2
## 
## Coefficients Sign
##            + (%) - (%)
## age_d        100     0
## haplotype    100     0
## rejet_aigu   100     0
## sexe_d       100     0
## sexe_r       100     0
## cyp3A5D        0   100
## TIF            0   100
## 
## Stat Significance
##            (%)
## age_d      100
## cyp3A5D    100
## haplotype  100
## rejet_aigu 100
## sexe_d     100
## sexe_r     100
## TIF        100
## 
## 
## The stepAIC() for the original data-set gave
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype, data = original1)
## 
##             coef exp(coef) se(coef)     z        p
## haplotype 1.2035    3.3319   0.3276 3.674 0.000239
## 
## Likelihood ratio test=14.01  on 1 df, p=0.0001822
## n= 253, number of events= 22 
## 
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Surv(delai_event, event) ~ haplotype + cyp3A5D + age_r + sexe_r + 
##     age_d + sexe_d + rejet_aigu + TIF
## 
## Final Model:
## Surv(delai_event, event) ~ haplotype
## 
## 
##           Step Df    Deviance Resid. Df Resid. Dev      AIC
## 1                                    14  -24.59675 212.4446
## 2     - sexe_r  1 0.009067423        15  -24.58769 206.9203
## 3      - age_r  1 0.039034149        16  -24.54865 201.4259
## 4        - TIF  1 0.251427799        17  -24.29723 196.1439
## 5     - sexe_d  1 0.442126797        18  -23.85510 191.0527
## 6      - age_d  1 2.811491990        19  -21.04361 188.3308
## 7    - cyp3A5D  1 2.805921958        20  -18.23768 185.6033
## 8 - rejet_aigu  1 4.230950507        21  -14.00673 184.3009

synhtetic knn20

boot.stepAIC(fit_synthetique, avatars_tibble_knn20, B = 100, k=log(nrow(avatars_tibble_knn20)))
## 
## Summary of Bootstrapping the 'stepAIC()' procedure for
## 
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = avatars_tibble_knn20)
## 
## Bootstrap samples: 100 
## Direction: backward 
## Penalty: 5.53 * df
## 
## Covariates selected
##            (%)
## haplotype   83
## rejet_aigu  53
## TIF         43
## cyp3A5D     34
## age_d       17
## sexe_r       7
## sexe_d       6
## age_r        5
## Null         4
## 
## Coefficients Sign
##             + (%)  - (%)
## haplotype  100.00   0.00
## rejet_aigu 100.00   0.00
## age_d       94.12   5.88
## age_r       80.00  20.00
## sexe_d      16.67  83.33
## cyp3A5D      0.00 100.00
## sexe_r       0.00 100.00
## TIF          0.00 100.00
## 
## Stat Significance
##               (%)
## age_d      100.00
## age_r      100.00
## TIF        100.00
## haplotype   97.59
## cyp3A5D     97.06
## rejet_aigu  90.57
## sexe_r      85.71
## sexe_d      83.33
## 
## 
## The stepAIC() for the original data-set gave
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype, data = avatars_tibble_knn20)
## 
##             coef exp(coef) se(coef)     z      p
## haplotype 1.4907    4.4401   0.5537 2.692 0.0071
## 
## Likelihood ratio test=7.34  on 1 df, p=0.006759
## n= 253, number of events= 13 
## 
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Surv(delai_event, event) ~ haplotype + cyp3A5D + age_r + sexe_r + 
##     age_d + sexe_d + rejet_aigu + TIF
## 
## Final Model:
## Surv(delai_event, event) ~ haplotype
## 
## 
##           Step Df   Deviance Resid. Df Resid. Dev      AIC
## 1                                    5 -21.869781 142.3860
## 2      - age_r  1 0.01617331         6 -21.853608 136.8688
## 3     - sexe_d  1 0.32081312         7 -21.532795 131.6562
## 4     - sexe_r  1 0.66821937         8 -20.864576 126.7911
## 5      - age_d  1 1.78823983         9 -19.076336 123.0459
## 6    - cyp3A5D  1 2.32166936        10 -16.754666 119.8342
## 7        - TIF  1 5.21665836        11 -11.538008 119.5175
## 8 - rejet_aigu  1 4.20197314        12  -7.336035 118.1860
Final model original
fit_original <- coxph(Surv(delai_event, event) ~ haplotype , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype, data = original1)
## 
##   n= 253, number of events= 22 
## 
##             coef exp(coef) se(coef)     z Pr(>|z|)    
## haplotype 1.2035    3.3319   0.3276 3.674 0.000239 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## haplotype     3.332     0.3001     1.753     6.332
## 
## Concordance= 0.682  (se = 0.044 )
## Likelihood ratio test= 14.01  on 1 df,   p=2e-04
## Wald test            = 13.5  on 1 df,   p=2e-04
## Score (logrank) test = 14.91  on 1 df,   p=1e-04
Final model synthetic
fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype  , data = avatars_tibble_knn20)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype, data = avatars_tibble_knn20)
## 
##   n= 253, number of events= 13 
## 
##             coef exp(coef) se(coef)     z Pr(>|z|)   
## haplotype 1.4907    4.4401   0.5537 2.692   0.0071 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## haplotype      4.44     0.2252       1.5     13.14
## 
## Concordance= 0.673  (se = 0.032 )
## Likelihood ratio test= 7.34  on 1 df,   p=0.007
## Wald test            = 7.25  on 1 df,   p=0.007
## Score (logrank) test = 7.03  on 1 df,   p=0.008

Bootstrap of the coefficient for haplotype

Allow to define the variability range of HR for a given dataset (intra dataset variability)

# Define the Cox model
cox_model <- function(data, indices) {
  d <- data[indices,] # allows bootstrapping to sample the data
  fit <- coxph(Surv(delai_event, event) ~ haplotype , data=d)
  return(fit$coefficients)
}

# Set the seed for reproducibility
set.seed(12)

# Bootstrap the Cox model
boot_results <- boot(data=avatars_tibble_knn20, statistic=cox_model, R=100)

# Convert bootstrap results to a data frame for ggplot2
boot_hrs <- exp(boot_results$t) # Convert log(HR) to HR
hr_data <- data.frame(HR=boot_hrs[,1])

# Calculate summary statistics
summary_stats <- quantile(hr_data$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1))
names(summary_stats) <- c("Min","2.5th", "5th", "25th", "Median", "75th", "95th","97.5th","Max")

# Create the histogram
ggplot(hr_data, aes(x=HR)) +
  geom_histogram(bins=30, fill="#007a86", color="black") +
  # geom_vline(aes(xintercept=summary_stats["Min"]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=summary_stats["25th"]), color="gray", linetype="dashed", linewidth=2) +
  geom_vline(aes(xintercept=summary_stats["Median"]), color="blue", linetype="dashed", linewidth=2) +
  geom_vline(aes(xintercept=summary_stats["75th"]), color="gray", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["Max"]), color="purple", linetype="dashed") +
  labs(title="Bootstrap Distribution of Hazard Ratios", x="Hazard Ratio (HR)", y="Frequency") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

# Print summary statistics
print(summary_stats)
##        Min      2.5th        5th       25th     Median       75th       95th 
##   2.294047   2.553093   2.634223   3.574178   4.495083   6.306976  19.492347 
##     97.5th        Max 
##  28.651190 101.963509
Modele final & KM
km_original <- survfit(Surv(delai_event, event) ~ haplotype, data = original)
km_original_plot <- ggsurvplot(
  km_original,
  data = original,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)
km_original_plot

km_synthetique <- survfit(Surv(delai_event, event) ~ haplotype, data = avatars_tibble_knn20)
km_synthetique_avatar_20 <- ggsurvplot(
  km_synthetique,
  data = avatars_tibble_knn20,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)
km_synthetique_avatar_20

Plots original & synthetic combined

## combine data
combined_df <- rbind(original %>% mutate(group = "original"), avatars_tibble_factor_knn20 %>% mutate(group = "synthetic")) %>% mutate(combined_haplotype = str_c(haplotype,"_", group ))

## fit the model
km_combined <- survfit(Surv(delai_event, event) ~ combined_haplotype, data = combined_df)

# plot
ggsurvplot(fit = km_combined, 
           data = combined_df,
           
  size = 1,                 # change line size
  conf.int = FALSE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.35, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)

Graphical exploraiotn of distribution

library(GGally)

pm_knn20 <- combined_df %>% select(haplotype:delai_event, group) %>% ggpairs(
  ggplot2::aes(colour = group,alpha = 0.5),
  upper = list(continuous = wrap("cor", size = 1.5)),
  lower=list(combo=wrap("facethist", binwidth=0.5))) + 
  theme(strip.text.x = element_text(size = 5),
           strip.text.y = element_text(size = 5),axis.text = element_text(size = 5))
pm_knn20

# ggsave("comparaison_distribution_knn20.pdf")

function to aggregating the HR and CI95 results of Cox models after changing 100 times the seed with knn = 20

Allow to define the variability range of HR for different Avatar generated with different seed but the same knn (inter dataset variability)

# Assuming all your existing functions and necessary libraries are loaded

run_model_with_seed <- function(seed_value) {
 
  
  # Number of neighbors
  k <- 20  # Adjust this based on your requirement
  
  pca_transformed_data <- pca$x
  knn_result <- get.knn(pca_transformed_data, k)
  
  generate_avatar_weights <- function(knn_result, pca_transformed_data, k) {
    n <- nrow(pca_transformed_data)
    avatar_weights <- matrix(nrow = n, ncol = k)
    
    for (i in 1:n) {
      # Step 1: Inverse of Distances
      distances <- sqrt(rowSums((pca_transformed_data[knn_result$nn.index[i, ], ] - pca_transformed_data[i, ])^2))
      inverse_distances <- 1 / distances
      
      # Step 2: Random Weights

      random_weights <- rexp(k, rate = 1)
      
      # Step 3: Contribution Factors
  
      shuffled_indices <- sample(k)
      contribution_factors <- 1 / (2^shuffled_indices)
      
      # Step 4: Calculate Weights
      weights <- inverse_distances * random_weights * contribution_factors
      
      # Step 5: Normalize Weights
      normalized_weights <- weights / sum(weights)
      
      avatar_weights[i, ] <- normalized_weights
    }
    
    return(avatar_weights)
  }
  
  
  
  # Generate avatar weights
   set.seed(seed_value)
  avatar_weights <- generate_avatar_weights(knn_result, pca_transformed_data, k)
  
  
  # Function to generate avatars in PCA space based on weights
  generate_avatars_pca_space <- function(pca_transformed_data, knn_indices, weights) {
    n <- nrow(pca_transformed_data)
    avatars_pca <- matrix(nrow = n, ncol = ncol(pca_transformed_data))
    
    for (i in 1:n) {
      weighted_avatars <- pca_transformed_data[knn_indices[i, ], ] * weights[i, ]
      avatars_pca[i, ] <- colSums(weighted_avatars)
    }
    
    return(avatars_pca)
  }
  # Generate avatars in PCA space
  avatars_pca_space <- generate_avatars_pca_space(pca_transformed_data, knn_result$nn.index, avatar_weights)
  

  # Inverse PCA transformation
  inverse_pca <- function(pca_object, pca_data) {
    return(pca_data %*% t(pca_object$rotation) + matrix(pca_object$center, nrow = nrow(pca_data), ncol = ncol(pca_object$rotation), byrow = TRUE))
  }
  avatars_original_scale <- inverse_pca(pca, avatars_pca_space)
  
 
  # Inverse normalization (if the original data was normalized)
  avatars_rescaled <- scale(avatars_original_scale, center = FALSE, scale = 1/attr(data_normalized, "scaled:scale"))
  avatars_rescaled <- sweep(avatars_rescaled, 2, attr(data_normalized, "scaled:center"), "+")
  
avatars_tibble_knn20   <- as_tibble(avatars_rescaled) %>% 
    mutate(haplotype = round(haplotype, digits=0),
           cyp3A5D = round(cyp3A5D, digits=0),
           sexe_r  = round(sexe_r , digits=0),
           sexe_d  = round(sexe_d , digits=0),
           rejet_aigu  = round(rejet_aigu , digits=0),
           event = round(event, digits=0)
           # CYP3A4_1B = round(CYP3A4_1B, digits=0),
           # MDR1_C1236T = round(MDR1_C1236T, digits=0),
           # MDR1_G2677T = round(MDR1_G2677T, digits=0),
           # MDR1_C3435T = round(MDR1_C3435T, digits=0)
    ) 
  
avatars_tibble_factor_knn20   <- as_tibble(avatars_rescaled) %>% 
    mutate(haplotype = round(haplotype, digits=0),
           cyp3A5D = round(cyp3A5D, digits=0),
           sexe_r  = round(sexe_r , digits=0),
           sexe_d  = round(sexe_d , digits=0),
           rejet_aigu  = round(rejet_aigu , digits=0),
           event = round(event, digits=0)
           # CYP3A4_1B = round(CYP3A4_1B, digits=0),
           # MDR1_C1236T = round(MDR1_C1236T, digits=0),
           # MDR1_G2677T = round(MDR1_G2677T, digits=0),
           # MDR1_C3435T = round(MDR1_C3435T, digits=0)
    ) %>% 
    mutate(haplotype = as.factor(haplotype),
           cyp3A5D = as.factor(cyp3A5D),
           sexe_r = as.factor(sexe_r),
           sexe_d = as.factor(sexe_d),
           # CYP3A4_1B = as.factor(CYP3A4_1B),
           # MDR1_C1236T = as.factor(MDR1_C1236T),
           # MDR1_G2677T = as.factor(MDR1_G2677T),
           # MDR1_C3435T = as.factor(MDR1_C3435T),
           rejet_aigu = as.factor(rejet_aigu))

  
  # Finally, fit the Cox model
  fit <- coxph(Surv(delai_event, event) ~ haplotype , 
               data = avatars_tibble_knn20)

  # Calculate confidence intervals
  ci <- confint(fit)
  
  return(list(fit = fit, ci = ci))
}


extract_hrs_and_cis <- function(model_output) {
  coefs <- model_output$fit$coefficients
  ci <- model_output$ci

  hr <- exp(coefs)
  ci_lower <- exp(ci[,"2.5 %"])
  ci_upper <- exp(ci[,"97.5 %"])

  return(data.frame(variable = names(hr), hr = hr, ci_lower = ci_lower, ci_upper = ci_upper))
}

# Generate a list of seed values
seed_values <- sample(x=100) # Modify this if you need different seed values

# Apply the algorithm with different seed values
model_results <- map(seed_values, run_model_with_seed)

# Extract HR and CI from model results
extracted_results <- map(model_results, extract_hrs_and_cis)

# Combine results into a single data frame
combined_results <- bind_rows(extracted_results)

# Calculate median HR and CI for each variable
# aggregate_metrics <- combined_results %>%
#   group_by(variable) %>%
#   summarize(
#     median_hr = median(hr),
#     median_ci_lower = median(ci_lower),
#     median_ci_upper = median(ci_upper)
#   )
# 
# aggregate_metrics

# Calculate the specified percentiles for HRs for each variable
percentile_metrics <- combined_results %>%
  group_by(variable) %>%
  summarize(
    percentile_0 = quantile(hr, probs = 0),
    percentile_5 = quantile(hr, probs = 0.05),
    percentile_25 = quantile(hr, probs = 0.25),
    percentile_50 = quantile(hr, probs = 0.5),
    percentile_75 = quantile(hr, probs = 0.75),
    percentile_95 = quantile(hr, probs = 0.95),
    percentile_100 = quantile(hr, probs = 1)
  ) %>%
  pivot_longer(-variable, names_to = "Percentile_HR", values_to = "Value_HR") %>% 
  mutate(Value_HR = round(Value_HR, 2))
# percentile_metrics
# datatable(percentile_metrics)
knitr::kable(percentile_metrics %>%  mutate(Value_HR = round(Value_HR, 2)), "simple")
variable Percentile_HR Value_HR
haplotype percentile_0 2.78
haplotype percentile_5 3.49
haplotype percentile_25 4.35
haplotype percentile_50 5.98
haplotype percentile_75 8.04
haplotype percentile_95 12.90
haplotype percentile_100 25.90

data augmentation avatar

With 20 knn

We investigate the effect of data augmentaiotn with a defined seed and knn=20

data_augment_avatar <- function(x) {
data_normalized <- scale(original1)
pca <- prcomp(data_normalized, scale. = FALSE)# pour selecitonner le nombre de cp rank. = 3
# Number of neighbors
k <- 20  # Adjust this based on your requirement
pca_transformed_data <- pca$x
knn_result <- get.knn(pca_transformed_data, k)

generate_avatar_weights <- function(knn_result, pca_transformed_data, k) {
  n <- nrow(pca_transformed_data)
  avatar_weights <- matrix(nrow = n, ncol = k)
  
  for (i in 1:n) {
    # Step 1: Inverse of Distances
    distances <- sqrt(rowSums((pca_transformed_data[knn_result$nn.index[i, ], ] - pca_transformed_data[i, ])^2))
    inverse_distances <- 1 / distances
    
    # Step 2: Random Weights

    random_weights <- rexp(k, rate = 1)
    
    # Step 3: Contribution Factors
 
    shuffled_indices <- sample(k)
    contribution_factors <- 1 / (2^shuffled_indices)
    
    # Step 4: Calculate Weights
    weights <- inverse_distances * random_weights * contribution_factors
    
    # Step 5: Normalize Weights
    normalized_weights <- weights / sum(weights)
    
    avatar_weights[i, ] <- normalized_weights
  }
  
  return(avatar_weights)
}



# Generate avatar weights
   set.seed( str_c(1,x))
avatar_weights <- generate_avatar_weights(knn_result, pca_transformed_data, k)

# Assuming pca_result, avatar_weights, knn_result$nn.index, and pca_transformed_data are already defined

# Function to generate avatars in PCA space based on weights
generate_avatars_pca_space <- function(pca_transformed_data, knn_indices, weights) {
  n <- nrow(pca_transformed_data)
  avatars_pca <- matrix(nrow = n, ncol = ncol(pca_transformed_data))
  
  for (i in 1:n) {
    weighted_avatars <- pca_transformed_data[knn_indices[i, ], ] * weights[i, ]
    avatars_pca[i, ] <- colSums(weighted_avatars)
  }
  
  return(avatars_pca)
}
# Generate avatars in PCA space
avatars_pca_space <- generate_avatars_pca_space(pca_transformed_data, knn_result$nn.index, avatar_weights)
# Assuming 'aids_pca' is the PCA object and 'avatars_pca_space' contains the avatars in PCA space
# Inverse PCA transformation
inverse_pca <- function(pca_object, pca_data) {
  return(pca_data %*% t(pca_object$rotation) + matrix(pca_object$center, nrow = nrow(pca_data), ncol = ncol(pca_object$rotation), byrow = TRUE))
}
avatars_original_scale <- inverse_pca(pca, avatars_pca_space)

# Assuming 'aids_data_normalized' contains the scaling attributes of the original data
# Inverse normalization (if the original data was normalized)
avatars_rescaled <- scale(avatars_original_scale, center = FALSE, scale = 1/attr(data_normalized, "scaled:scale"))
avatars_rescaled <- sweep(avatars_rescaled, 2, attr(data_normalized, "scaled:center"), "+")

avatars_tibble <- as_tibble(avatars_rescaled) %>% 
  mutate(haplotype = round(haplotype, digits=0),
         cyp3A5D = round(cyp3A5D, digits=0),
         sexe_r  = round(sexe_r , digits=0),
         sexe_d  = round(sexe_d , digits=0),
         rejet_aigu  = round(rejet_aigu , digits=0),
         event = round(event, digits=0)
         # CYP3A4_1B = round(CYP3A4_1B, digits=0),
         # MDR1_C1236T = round(MDR1_C1236T, digits=0),
         # MDR1_G2677T = round(MDR1_G2677T, digits=0),
         # MDR1_C3435T = round(MDR1_C3435T, digits=0)
  ) 
}

iteration <- c(1:4)

augmented_data_20 <- map_dfr(iteration, data_augment_avatar, .id = "iter_")

augmented_data_20 <- augmented_data_20 %>% 
  select(-iter_)

augmented_data_20_factor_knn20 <- augmented_data_20 %>% 
 mutate(haplotype = as.factor(haplotype),
         cyp3A5D = as.factor(cyp3A5D),
         sexe_r = as.factor(sexe_r),
         sexe_d = as.factor(sexe_d),
         # CYP3A4_1B = as.factor(CYP3A4_1B),
         # MDR1_C1236T = as.factor(MDR1_C1236T),
         # MDR1_G2677T = as.factor(MDR1_G2677T),
         # MDR1_C3435T = as.factor(MDR1_C3435T),
         rejet_aigu = as.factor(rejet_aigu))

Plot of the synthetic and original in the latent space

# Combine original and synthetic data for visualization
combined_data <- rbind(
  original1 %>% mutate(DataType = 'Original'),
  augmented_data_20 %>% mutate(DataType = 'Synthetic')
)

# Perform PCA on combined data
combined_data_normalized <- scale(combined_data[, -which(names(combined_data) %in% c("DataType", "id"))])
combined_pca <- prcomp(combined_data_normalized, scale. = FALSE)

# Extract the first two principal components
combined_pca_data <- data.frame(combined_pca$x[, 1:2])
combined_pca_data$DataType <- combined_data$DataType

# Plot PCA with color differentiation
ggplot(combined_pca_data, aes(x = PC1, y = PC2, color = DataType)) +
  geom_point(alpha = 0.5) +
  theme_minimal() +
  labs(title = "PCA Plot", x = "Principal Component 1", y = "Principal Component 2", color = "Data Type")

Export data augmented knn = 20

write_csv(augmented_data_20, file = "avatar_sfpt_knn20_data_augmented.csv")

Comparison of the datasets augmented and original

Summary of the 2 datasets

## Vector of categorical variables that need transformation
catVars <- c("haplotype", "cyp3A5D",  "sexe_r",  "sexe_d", 
"rejet_aigu", "event")
## Create a variable list.
vars <- c( "haplotype", "cyp3A5D", "age_r", "sexe_r", "age_d", "sexe_d", 
"rejet_aigu", "TIF", "event", "delai_event", "DataType")
tableOne <- CreateTableOne(vars = vars, strata = "DataType",factorVars = catVars, data = combined_data)
tableOne2<-print(tableOne, nonnormal = c( "age_r", "age_d", "TIF", "delai_event"), printToggle=F, minMax=T)
Original Synthetic p test
n 253 1012
haplotype (%) <0.001
1 97 (38.3) 345 ( 34.1)
2 123 (48.6) 614 ( 60.7)
3 33 (13.0) 53 ( 5.2)
cyp3A5D = 2 (%) 211 (83.4) 909 ( 89.8) 0.006
age_r (median [range]) 55.00 [19.00, 78.00] 55.60 [24.90, 75.77] 0.755 nonnorm
sexe_r = 2 (%) 156 (61.7) 700 ( 69.2) 0.027
age_d (median [range]) 40.00 [12.00, 73.00] 39.73 [16.63, 67.75] 0.479 nonnorm
sexe_d = 2 (%) 174 (68.8) 743 ( 73.4) 0.161
rejet_aigu = 2 (%) 81 (32.0) 244 ( 24.1) 0.013
TIF (median [range]) 1153.00 [303.00, 2580.00] 1144.15 [576.99, 2091.07] 0.516 nonnorm
event = 1 (%) 22 ( 8.7) 55 ( 5.4) 0.073
delai_event (median [range]) 5.34 [0.68, 15.83] 5.21 [1.02, 15.10] 0.989 nonnorm
DataType = Synthetic (%) 0 ( 0.0) 1012 (100.0) <0.001
summary(original)
##  haplotype   cyp3A5D       age_r       sexe_r      age_d       sexe_d 
##  autre: 97   Es : 42   Min.   :19.00   F: 97   Min.   :12.00   F: 79  
##  het  :123   NEs:211   1st Qu.:44.00   M:156   1st Qu.:25.00   M:174  
##  hom  : 33             Median :55.00           Median :40.00          
##                        Mean   :53.84           Mean   :38.49          
##                        3rd Qu.:64.00           3rd Qu.:49.00          
##                        Max.   :78.00           Max.   :73.00          
##  rejet_aigu      TIF           event          delai_event    
##  0:172      Min.   : 303   Min.   :0.00000   Min.   : 0.680  
##  1: 81      1st Qu.: 975   1st Qu.:0.00000   1st Qu.: 2.920  
##             Median :1153   Median :0.00000   Median : 5.340  
##             Mean   :1199   Mean   :0.08696   Mean   : 6.044  
##             3rd Qu.:1368   3rd Qu.:0.00000   3rd Qu.: 8.700  
##             Max.   :2580   Max.   :1.00000   Max.   :15.830
summary(augmented_data_20)
##    haplotype        cyp3A5D          age_r           sexe_r     
##  Min.   :1.000   Min.   :1.000   Min.   :24.90   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:46.31   1st Qu.:1.000  
##  Median :2.000   Median :2.000   Median :55.60   Median :2.000  
##  Mean   :1.711   Mean   :1.898   Mean   :53.80   Mean   :1.692  
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:61.59   3rd Qu.:2.000  
##  Max.   :3.000   Max.   :2.000   Max.   :75.77   Max.   :2.000  
##      age_d           sexe_d        rejet_aigu         TIF      
##  Min.   :16.63   Min.   :1.000   Min.   :1.000   Min.   : 577  
##  1st Qu.:30.92   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1030  
##  Median :39.73   Median :2.000   Median :1.000   Median :1144  
##  Mean   :39.12   Mean   :1.734   Mean   :1.241   Mean   :1163  
##  3rd Qu.:47.08   3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:1270  
##  Max.   :67.75   Max.   :2.000   Max.   :2.000   Max.   :2091  
##      event          delai_event    
##  Min.   :0.00000   Min.   : 1.024  
##  1st Qu.:0.00000   1st Qu.: 3.758  
##  Median :0.00000   Median : 5.211  
##  Mean   :0.05435   Mean   : 5.675  
##  3rd Qu.:0.00000   3rd Qu.: 7.106  
##  Max.   :1.00000   Max.   :15.099

individual data explorer

# boxplots
plot_boxplot(combined_data , by ="DataType") 

# histograms

# Function to create histogram for each continuous variable
plot_histograms <- function(data, var_name, group_var) {
  ggplot(data, aes(x = !!sym(var_name), fill = !!sym(group_var))) +
    geom_histogram(alpha = 0.5,show.legend = FALSE) +
    labs(x = var_name, y = "Count") +
    theme_minimal() +
    ggtitle(paste(var_name))
}

# Using select_if to identify continuous variables and map to apply the function
plots <- combined_data %>%
  select( -sexe_r,-sexe_d) %>% 
  select_if(is.numeric) %>%
  names() %>%
  map(~plot_histograms(combined_data, ., "DataType"))

# Optionally, print or arrange plots (e.g., using gridExtra or patchwork packages)

wrap_plots(plots)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

plot correlation

##Correlation Analysis
  cor_real <- cor(original1, use = "complete.obs")
  cor_synthetic <- cor(augmented_data_20, use = "complete.obs")
  
# plots
ggcorrplot(cor_real, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

# plots
ggcorrplot(cor_synthetic, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

Modele de Cox

# original
fit_original <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = original1)
## 
##   n= 253, number of events= 22 
## 
##                  coef  exp(coef)   se(coef)      z Pr(>|z|)    
## haplotype   1.2112071  3.3575352  0.3463273  3.497  0.00047 ***
## cyp3A5D    -1.2323909  0.2915946  0.5567303 -2.214  0.02685 *  
## age_r      -0.0039521  0.9960557  0.0187880 -0.210  0.83339    
## sexe_r     -0.0438849  0.9570641  0.4606422 -0.095  0.92410    
## age_d       0.0360206  1.0366772  0.0203668  1.769  0.07696 .  
## sexe_d      0.2786636  1.3213627  0.5181402  0.538  0.59070    
## rejet_aigu  1.0124644  2.7523756  0.4804379  2.107  0.03508 *  
## TIF        -0.0002268  0.9997732  0.0005753 -0.394  0.69345    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     3.3575     0.2978   1.70305    6.6193
## cyp3A5D       0.2916     3.4294   0.09792    0.8683
## age_r         0.9961     1.0040   0.96004    1.0334
## sexe_r        0.9571     1.0449   0.38801    2.3607
## age_d         1.0367     0.9646   0.99611    1.0789
## sexe_d        1.3214     0.7568   0.47861    3.6481
## rejet_aigu    2.7524     0.3633   1.07339    7.0576
## TIF           0.9998     1.0002   0.99865    1.0009
## 
## Concordance= 0.758  (se = 0.054 )
## Likelihood ratio test= 24.6  on 8 df,   p=0.002
## Wald test            = 21.09  on 8 df,   p=0.007
## Score (logrank) test = 25.84  on 8 df,   p=0.001
ggforest(fit_original)

# synthetique
fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF , data = augmented_data_20)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = augmented_data_20)
## 
##   n= 1012, number of events= 55 
## 
##                  coef  exp(coef)   se(coef)      z Pr(>|z|)    
## haplotype   1.1558853  3.1768346  0.2802950  4.124 3.73e-05 ***
## cyp3A5D    -1.3490551  0.2594853  0.3808095 -3.543 0.000396 ***
## age_r       0.0192921  1.0194794  0.0167408  1.152 0.249157    
## sexe_r      0.2466270  1.2797016  0.3022308  0.816 0.414488    
## age_d       0.0318037  1.0323149  0.0191358  1.662 0.096513 .  
## sexe_d     -0.4167056  0.6592149  0.3077915 -1.354 0.175782    
## rejet_aigu  0.9525651  2.5923507  0.2908215  3.275 0.001055 ** 
## TIF        -0.0002825  0.9997175  0.0006848 -0.413 0.679904    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     3.1768     0.3148    1.8340    5.5028
## cyp3A5D       0.2595     3.8538    0.1230    0.5473
## age_r         1.0195     0.9809    0.9866    1.0535
## sexe_r        1.2797     0.7814    0.7077    2.3140
## age_d         1.0323     0.9687    0.9943    1.0718
## sexe_d        0.6592     1.5170    0.3606    1.2051
## rejet_aigu    2.5924     0.3858    1.4660    4.5840
## TIF           0.9997     1.0003    0.9984    1.0011
## 
## Concordance= 0.76  (se = 0.036 )
## Likelihood ratio test= 57.97  on 8 df,   p=1e-09
## Wald test            = 60.58  on 8 df,   p=4e-10
## Score (logrank) test = 62.39  on 8 df,   p=2e-10
ggforest(fit_synthetique)

BootstepAIC augmented synthetic knn20

boot.stepAIC(fit_synthetique, augmented_data_20, B = 100, k=log(nrow(augmented_data_20)))
## 
## Summary of Bootstrapping the 'stepAIC()' procedure for
## 
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = augmented_data_20)
## 
## Bootstrap samples: 100 
## Direction: backward 
## Penalty: 6.92 * df
## 
## Covariates selected
##            (%)
## haplotype  100
## rejet_aigu  72
## cyp3A5D     66
## age_d       39
## age_r       20
## sexe_d      13
## TIF          9
## sexe_r       6
## 
## Coefficients Sign
##             + (%)  - (%)
## age_d      100.00   0.00
## age_r      100.00   0.00
## haplotype  100.00   0.00
## rejet_aigu 100.00   0.00
## sexe_r     100.00   0.00
## TIF         11.11  88.89
## cyp3A5D      0.00 100.00
## sexe_d       0.00 100.00
## 
## Stat Significance
##            (%)
## age_d      100
## age_r      100
## cyp3A5D    100
## haplotype  100
## rejet_aigu 100
## sexe_d     100
## sexe_r     100
## TIF        100
## 
## 
## The stepAIC() for the original data-set gave
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_d + rejet_aigu, data = augmented_data_20)
## 
##                coef exp(coef) se(coef)      z        p
## haplotype   1.20801   3.34682  0.26892  4.492 7.05e-06
## cyp3A5D    -1.35592   0.25771  0.37927 -3.575  0.00035
## age_d       0.04692   1.04804  0.01769  2.652  0.00800
## rejet_aigu  0.87931   2.40925  0.28225  3.115  0.00184
## 
## Likelihood ratio test=52.78  on 4 df, p=9.484e-11
## n= 1012, number of events= 55 
## 
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Surv(delai_event, event) ~ haplotype + cyp3A5D + age_r + sexe_r + 
##     age_d + sexe_d + rejet_aigu + TIF
## 
## Final Model:
## Surv(delai_event, event) ~ haplotype + cyp3A5D + age_d + rejet_aigu
## 
## 
##       Step Df  Deviance Resid. Df Resid. Dev      AIC
## 1                              47  -57.97016 631.2875
## 2    - TIF  1 0.1732023        48  -57.79696 624.5410
## 3 - sexe_r  1 0.6650970        49  -57.13186 618.2864
## 4  - age_r  1 0.9729519        50  -56.15891 612.3397
## 5 - sexe_d  1 3.3809730        51  -52.77794 608.8010

Final model original

fit_original <- coxph(Surv(delai_event, event) ~ haplotype + rejet_aigu , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + rejet_aigu, 
##     data = original1)
## 
##   n= 253, number of events= 22 
## 
##              coef exp(coef) se(coef)     z Pr(>|z|)    
## haplotype  1.1681    3.2160   0.3261 3.582 0.000341 ***
## rejet_aigu 0.9238    2.5188   0.4661 1.982 0.047482 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype      3.216     0.3109     1.697     6.094
## rejet_aigu     2.519     0.3970     1.010     6.280
## 
## Concordance= 0.732  (se = 0.05 )
## Likelihood ratio test= 18.24  on 2 df,   p=1e-04
## Wald test            = 17.44  on 2 df,   p=2e-04
## Score (logrank) test = 19.29  on 2 df,   p=6e-05

Final model synthetic

fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D + age_d + rejet_aigu, data = augmented_data_20)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_d + rejet_aigu, data = augmented_data_20)
## 
##   n= 1012, number of events= 55 
## 
##                coef exp(coef) se(coef)      z Pr(>|z|)    
## haplotype   1.20801   3.34682  0.26892  4.492 7.05e-06 ***
## cyp3A5D    -1.35592   0.25771  0.37927 -3.575  0.00035 ***
## age_d       0.04692   1.04804  0.01769  2.652  0.00800 ** 
## rejet_aigu  0.87931   2.40925  0.28225  3.115  0.00184 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     3.3468     0.2988    1.9757     5.669
## cyp3A5D       0.2577     3.8803    0.1225     0.542
## age_d         1.0480     0.9542    1.0123     1.085
## rejet_aigu    2.4092     0.4151    1.3856     4.189
## 
## Concordance= 0.747  (se = 0.035 )
## Likelihood ratio test= 52.78  on 4 df,   p=9e-11
## Wald test            = 51.35  on 4 df,   p=2e-10
## Score (logrank) test = 54.37  on 4 df,   p=4e-11

Modele final & KM

km_original <- survfit(Surv(delai_event, event) ~ haplotype, data = original)
ggsurvplot(
  km_original,
  data = original,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)

km_synthetique <- survfit(Surv(delai_event, event) ~ haplotype, data = augmented_data_20_factor_knn20)
km_synthetique_avatar_20_augmented <- ggsurvplot(
  km_synthetique,
  data = augmented_data_20_factor_knn20,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)
km_synthetique_avatar_20_augmented

Plots original & synthetic combined

## combine data
combined_df <- rbind(original %>% mutate(group = "original"), augmented_data_20_factor_knn20 %>% mutate(group = "synthetic")) %>% mutate(combined_haplotype = str_c(haplotype,"_", group ))

## fit the model
km_combined <- survfit(Surv(delai_event, event) ~ combined_haplotype, data = combined_df)

# plot
ggsurvplot(fit = km_combined, 
           data = combined_df,
           
  size = 1,                 # change line size
  conf.int = FALSE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.35, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)

Graphical exploraiotn of distribution

library(GGally)

pm_knn20_augmented <- combined_df %>% select(haplotype:delai_event, group) %>% ggpairs(
  ggplot2::aes(colour = group,alpha = 0.5),
  upper = list(continuous = wrap("cor", size = 1.5)),
  lower=list(combo=wrap("facethist", binwidth=0.5))) + 
  theme(strip.text.x = element_text(size = 5),
           strip.text.y = element_text(size = 5),axis.text = element_text(size = 5))
pm_knn20_augmented

# ggsave("comparaison_distribution_augmented_knn20.pdf")

Bootstrap of the coefficient for haplotype

Allow to define the variability range of HR for a given dataset (intra dataset variability)

# Define the Cox model
cox_model <- function(data, indices) {
  d <- data[indices,] # allows bootstrapping to sample the data
  fit <- coxph(Surv(delai_event, event) ~ haplotype + age_d + cyp3A5D + rejet_aigu , data=d)
  return(fit$coefficients)
}

# Set the seed for reproducibility
set.seed(12)

# Bootstrap the Cox model
boot_results <- boot(data=augmented_data_20, statistic=cox_model, R=100)

# Convert bootstrap results to a data frame for ggplot2
boot_hrs <- exp(boot_results$t) # Convert log(HR) to HR
hr_data_haplo <- data.frame(HR=boot_hrs[,1])
hr_data_age_d <- data.frame(HR=boot_hrs[,2])
hr_data_cyp3A5D <- data.frame(HR=boot_hrs[,3])
hr_data_rejet_aigu <- data.frame(HR=boot_hrs[,4])
# Calculate summary statistics
summary_stats <- quantile(hr_data_haplo$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) %>%  
  bind_rows(quantile(hr_data_age_d$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) ) %>% 
  bind_rows(quantile(hr_data_cyp3A5D$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) ) %>% 
  bind_rows(quantile(hr_data_rejet_aigu$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) )  
names(summary_stats) <- c("Min","2.5th", "5th", "25th", "Median", "75th", "95th","97.5th","Max")

# Create the histogram
ggplot(hr_data_haplo, aes(x=HR)) +
  geom_histogram(bins=30, fill="#007a86", color="black") +
  # # geom_vline(aes(xintercept=summary_stats["Min"]), color="red", linetype="dashed") +
  # geom_vline(aes(xintercept=summary_stats["25th"][[1]][[1]]), color="gray", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["Median"][[1]][[1]]), color="blue", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["75th"])[[1]][[1]], color="gray", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["Max"]), color="purple", linetype="dashed") +
  labs(title="Bootstrap Distribution of Hazard Ratios", x="Hazard Ratio (HR)", y="Frequency") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

# Print summary statistics
knitr::kable(summary_stats, "simple")
Min 2.5th 5th 25th Median 75th 95th 97.5th Max
2.2104216 2.4173764 2.5090651 2.9659151 3.372666 3.8308308 4.8235844 5.447514 6.068198
1.0024396 1.0218809 1.0260181 1.0405190 1.051443 1.0613370 1.0785523 1.082279 1.096752
0.1185243 0.1262952 0.1374447 0.1961807 0.263547 0.3461597 0.6474621 0.761141 1.247091
1.2164271 1.4634958 1.5824677 1.9565206 2.242277 2.6623821 4.4381318 4.712314 7.128857

inter seed vairbaility augmented knn20

Allow to define the inter variability range of HR for augmented knn=20 (inter dataset variability) by using 100 bootraps

# Assuming all your existing functions and necessary libraries are loaded

run_model_with_seed <- function(seed_value) {
  
  
  # augmentaiotn of data
  
  data_augment_avatar <- function(x) {
    data_normalized <- scale(original1)
    pca <- prcomp(data_normalized, scale. = FALSE)# pour selecitonner le nombre de cp rank. = 3
    # Number of neighbors
    k <- 20 # Adjust this based on your requirement
    pca_transformed_data <- pca$x
    knn_result <- get.knn(pca_transformed_data, k)
    
    generate_avatar_weights <- function(knn_result, pca_transformed_data, k) {
      n <- nrow(pca_transformed_data)
      avatar_weights <- matrix(nrow = n, ncol = k)
      
      for (i in 1:n) {
        # Step 1: Inverse of Distances
        distances <- sqrt(rowSums((pca_transformed_data[knn_result$nn.index[i, ], ] - pca_transformed_data[i, ])^2))
        inverse_distances <- 1 / distances
        
        # Step 2: Random Weights
        
        random_weights <- rexp(k, rate = 1)
        
        # Step 3: Contribution Factors
        
        shuffled_indices <- sample(k)
        contribution_factors <- 1 / (2^shuffled_indices)
        
        # Step 4: Calculate Weights
        weights <- inverse_distances * random_weights * contribution_factors
        
        # Step 5: Normalize Weights
        normalized_weights <- weights / sum(weights)
        
        avatar_weights[i, ] <- normalized_weights
      }
      
      return(avatar_weights)
    }
    
    
    
    # Generate avatar weights
 
    avatar_weights <- generate_avatar_weights(knn_result, pca_transformed_data, k)
    
    # Assuming pca_result, avatar_weights, knn_result$nn.index, and pca_transformed_data are already defined
    
    # Function to generate avatars in PCA space based on weights
    generate_avatars_pca_space <- function(pca_transformed_data, knn_indices, weights) {
      n <- nrow(pca_transformed_data)
      avatars_pca <- matrix(nrow = n, ncol = ncol(pca_transformed_data))
      
      for (i in 1:n) {
        weighted_avatars <- pca_transformed_data[knn_indices[i, ], ] * weights[i, ]
        avatars_pca[i, ] <- colSums(weighted_avatars)
      }
      
      return(avatars_pca)
    }
    # Generate avatars in PCA space
    avatars_pca_space <- generate_avatars_pca_space(pca_transformed_data, knn_result$nn.index, avatar_weights)
    # Assuming 'aids_pca' is the PCA object and 'avatars_pca_space' contains the avatars in PCA space
    # Inverse PCA transformation
    inverse_pca <- function(pca_object, pca_data) {
      return(pca_data %*% t(pca_object$rotation) + matrix(pca_object$center, nrow = nrow(pca_data), ncol = ncol(pca_object$rotation), byrow = TRUE))
    }
    avatars_original_scale <- inverse_pca(pca, avatars_pca_space)
    
    # Assuming 'aids_data_normalized' contains the scaling attributes of the original data
    # Inverse normalization (if the original data was normalized)
    avatars_rescaled <- scale(avatars_original_scale, center = FALSE, scale = 1/attr(data_normalized, "scaled:scale"))
    avatars_rescaled <- sweep(avatars_rescaled, 2, attr(data_normalized, "scaled:center"), "+")
    
    avatars_tibble <- as_tibble(avatars_rescaled) %>% 
      mutate(haplotype = round(haplotype, digits=0),
             cyp3A5D = round(cyp3A5D, digits=0),
             sexe_r  = round(sexe_r , digits=0),
             sexe_d  = round(sexe_d , digits=0),
             rejet_aigu  = round(rejet_aigu , digits=0),
             event = round(event, digits=0)
             # CYP3A4_1B = round(CYP3A4_1B, digits=0),
             # MDR1_C1236T = round(MDR1_C1236T, digits=0),
             # MDR1_G2677T = round(MDR1_G2677T, digits=0),
             # MDR1_C3435T = round(MDR1_C3435T, digits=0)
      ) 
  }
  
  iteration <- c(1:4)
  set.seed(seed_value)
  augmented_data_x <- map_dfr(iteration, data_augment_avatar, .id = "iter_")
  
  augmented_data_x <- augmented_data_x %>% 
    select(-iter_)
  
  
  ###############
  # Finally, fit the Cox model
  fit <- coxph(Surv(delai_event, event) ~ haplotype + age_d + cyp3A5D + rejet_aigu, 
               data = augmented_data_x)
  coefs <- fit$coefficients
  hr <- exp(coefs)
  return(data.frame(variable = names(hr), hr = hr))
  # Calculate confidence intervals
#  ci <- confint(fit)
  
 # return(list(fit = fit, ci = ci))
}




# Generate a list of seed values
seed_value <- sample(x=100) # Modify this if you need different seed values

# Apply the algorithm with different seed values
model_results <- map(seed_value, run_model_with_seed)

# Extract HR and CI from model results
#extracted_results <- map(model_results, extract_hrs_and_cis)

# Combine results into a single data frame
combined_results <- bind_rows(model_results)

# Calculate median HR and CI for each variable
# aggregate_metrics <- combined_results %>%
#   group_by(variable) %>%
#   summarize(
#     median_hr = median(hr),
#     median_ci_lower = median(ci_lower),
#     median_ci_upper = median(ci_upper)
#   )
# 
# aggregate_metrics

# Calculate the specified percentiles for HRs for each variable
percentile_metrics <- combined_results %>%
  group_by(variable) %>%
  summarize(
    percentile_0 = quantile(hr, probs = 0),
    percentile_5 = quantile(hr, probs = 0.05),
    percentile_25 = quantile(hr, probs = 0.25),
    percentile_50 = quantile(hr, probs = 0.5),
    percentile_75 = quantile(hr, probs = 0.75),
    percentile_95 = quantile(hr, probs = 0.95),
    percentile_100 = quantile(hr, probs = 1)
  ) %>%
  pivot_longer(-variable, names_to = "Percentile_HR", values_to = "Value_HR") %>% 
  mutate(Value_HR = round(Value_HR, 2))
# percentile_metrics
# datatable(percentile_metrics)
knitr::kable(percentile_metrics %>%  mutate(Value_HR = round(Value_HR, 2)), "simple")
variable Percentile_HR Value_HR
age_d percentile_0 1.00
age_d percentile_5 1.02
age_d percentile_25 1.04
age_d percentile_50 1.05
age_d percentile_75 1.06
age_d percentile_95 1.08
age_d percentile_100 1.09
cyp3A5D percentile_0 0.14
cyp3A5D percentile_5 0.19
cyp3A5D percentile_25 0.28
cyp3A5D percentile_50 0.35
cyp3A5D percentile_75 0.48
cyp3A5D percentile_95 0.97
cyp3A5D percentile_100 2.63
haplotype percentile_0 3.32
haplotype percentile_5 3.58
haplotype percentile_25 4.43
haplotype percentile_50 5.30
haplotype percentile_75 5.91
haplotype percentile_95 7.25
haplotype percentile_100 9.70
rejet_aigu percentile_0 1.73
rejet_aigu percentile_5 2.11
rejet_aigu percentile_25 2.80
rejet_aigu percentile_50 3.59
rejet_aigu percentile_75 4.52
rejet_aigu percentile_95 5.83
rejet_aigu percentile_100 8.85

augmented data: function to aggregating the HR and CI95 results of Cox models after changing 100 times the seed with different knn = 3, 5, 10, 15, 20, 50

Allow to define the variability range of HR for different Avatar generated with different seed and different values of knn

# library(tidyverse)
# 
# # Assuming all your existing functions and necessary libraries are loaded
# run_for_k_values <- function(k) {
# run_model_with_seed <- function(seed_value) {
#   
#   
#   # augmentaiotn of data
#   
#   data_augment_avatar <- function(x) {
#     data_normalized <- scale(original1)
#     pca <- prcomp(data_normalized, scale. = FALSE)# pour selecitonner le nombre de cp rank. = 3
#     # Number of neighbors
#     #k <- 20 # Adjust this based on your requirement
#     pca_transformed_data <- pca$x
#     knn_result <- get.knn(pca_transformed_data, k)
#     
#     generate_avatar_weights <- function(knn_result, pca_transformed_data, k) {
#       n <- nrow(pca_transformed_data)
#       avatar_weights <- matrix(nrow = n, ncol = k)
#       
#       for (i in 1:n) {
#         # Step 1: Inverse of Distances
#         distances <- sqrt(rowSums((pca_transformed_data[knn_result$nn.index[i, ], ] - pca_transformed_data[i, ])^2))
#         inverse_distances <- 1 / distances
#         
#         # Step 2: Random Weights
#         
#         random_weights <- rexp(k, rate = 1)
#         
#         # Step 3: Contribution Factors
#         
#         shuffled_indices <- sample(k)
#         contribution_factors <- 1 / (2^shuffled_indices)
#         
#         # Step 4: Calculate Weights
#         weights <- inverse_distances * random_weights * contribution_factors
#         
#         # Step 5: Normalize Weights
#         normalized_weights <- weights / sum(weights)
#         
#         avatar_weights[i, ] <- normalized_weights
#       }
#       
#       return(avatar_weights)
#     }
#     
#     
#     
#     # Generate avatar weights
#    
#     avatar_weights <- generate_avatar_weights(knn_result, pca_transformed_data, k)
#     
#     # Assuming pca_result, avatar_weights, knn_result$nn.index, and pca_transformed_data are already defined
#     
#     # Function to generate avatars in PCA space based on weights
#     generate_avatars_pca_space <- function(pca_transformed_data, knn_indices, weights) {
#       n <- nrow(pca_transformed_data)
#       avatars_pca <- matrix(nrow = n, ncol = ncol(pca_transformed_data))
#       
#       for (i in 1:n) {
#         weighted_avatars <- pca_transformed_data[knn_indices[i, ], ] * weights[i, ]
#         avatars_pca[i, ] <- colSums(weighted_avatars)
#       }
#       
#       return(avatars_pca)
#     }
#     # Generate avatars in PCA space
#     avatars_pca_space <- generate_avatars_pca_space(pca_transformed_data, knn_result$nn.index, avatar_weights)
#     # Assuming 'aids_pca' is the PCA object and 'avatars_pca_space' contains the avatars in PCA space
#     # Inverse PCA transformation
#     inverse_pca <- function(pca_object, pca_data) {
#       return(pca_data %*% t(pca_object$rotation) + matrix(pca_object$center, nrow = nrow(pca_data), ncol = ncol(pca_object$rotation), byrow = TRUE))
#     }
#     avatars_original_scale <- inverse_pca(pca, avatars_pca_space)
#     
#     # Assuming 'aids_data_normalized' contains the scaling attributes of the original data
#     # Inverse normalization (if the original data was normalized)
#     avatars_rescaled <- scale(avatars_original_scale, center = FALSE, scale = 1/attr(data_normalized, "scaled:scale"))
#     avatars_rescaled <- sweep(avatars_rescaled, 2, attr(data_normalized, "scaled:center"), "+")
#     
#     avatars_tibble <- as_tibble(avatars_rescaled) %>% 
#       mutate(haplotype = round(haplotype, digits=0),
#              cyp3A5D = round(cyp3A5D, digits=0),
#              sexe_r  = round(sexe_r , digits=0),
#              sexe_d  = round(sexe_d , digits=0),
#              rejet_aigu  = round(rejet_aigu , digits=0),
#              event = round(event, digits=0)
#              # CYP3A4_1B = round(CYP3A4_1B, digits=0),
#              # MDR1_C1236T = round(MDR1_C1236T, digits=0),
#              # MDR1_G2677T = round(MDR1_G2677T, digits=0),
#              # MDR1_C3435T = round(MDR1_C3435T, digits=0)
#       ) 
#   }
#   
#   iteration <- c(1:4)
#   # set.seed(seed_value)
#    set.seed(seed_value)
#   augmented_data_x <- map_dfr(iteration, data_augment_avatar, .id = "iter_")
#   
#   augmented_data_x <- augmented_data_x %>% 
#     select(-iter_)
#   
#   
#   ###############
#   # Finally, fit the Cox model
#   fit <- coxph(Surv(delai_event, event) ~ haplotype , 
#                data = augmented_data_x)
#   coefs <- fit$coefficients
#   hr <- exp(coefs)
#   return(data.frame(variable = names(hr), hr = hr))
#   # Calculate confidence intervals
#   #  ci <- confint(fit)
#   
#   # return(list(fit = fit, ci = ci))
# }
# 
# 
# 
# 
# # Generate a list of seed values
# seed_value <- sample(x=100) # Modify this if you need different seed values
# 
# # Apply the algorithm with different seed values
# model_results <- map(seed_value, run_model_with_seed)
# 
# # Extract HR and CI from model results
# #extracted_results <- map(model_results, extract_hrs_and_cis)
# 
# # Combine results into a single data frame
# combined_results <- bind_rows(model_results)
# 
# # Calculate median HR and CI for each variable
# # aggregate_metrics <- combined_results %>%
# #   group_by(variable) %>%
# #   summarize(
# #     median_hr = median(hr),
# #     median_ci_lower = median(ci_lower),
# #     median_ci_upper = median(ci_upper)
# #   )
# # 
# # aggregate_metrics
# 
# # Calculate the specified percentiles for HRs for each variable
# percentile_metrics <- combined_results %>%
#   group_by(variable) %>%
#   summarize(
#     percentile_0 = quantile(hr, probs = 0),
#     percentile_5 = quantile(hr, probs = 0.05),
#     percentile_25 = quantile(hr, probs = 0.25),
#     percentile_50 = quantile(hr, probs = 0.5),
#     percentile_75 = quantile(hr, probs = 0.75),
#     percentile_95 = quantile(hr, probs = 0.95),
#     percentile_100 = quantile(hr, probs = 1)
#   ) %>%
#   pivot_longer(-variable, names_to = "Percentile_HR", values_to = "Value_HR") %>% 
#   mutate(Value_HR = round(Value_HR, 2))
# # percentile_metrics
# 
# return(percentile_metrics)
# }
# # Define different k values
# k_values <- c(3, 5, 10, 15, 20, 50)
# 
# results_list <- map(k_values, run_for_k_values)
# names(results_list) <- paste("K =", k_values)
# 
# 
# # datatable(percentile_metrics)
# knitr::kable(results_list, "simple")

KNN=10

# Number of neighbors
k <- 10  # Adjust this based on your requirement

algorithm

pca_transformed_data <- pca$x
knn_result <- get.knn(pca_transformed_data, k)

generate_avatar_weights <- function(knn_result, pca_transformed_data, k) {
  n <- nrow(pca_transformed_data)
  avatar_weights <- matrix(nrow = n, ncol = k)
  
  for (i in 1:n) {
    # Step 1: Inverse of Distances
    distances <- sqrt(rowSums((pca_transformed_data[knn_result$nn.index[i, ], ] - pca_transformed_data[i, ])^2))
    inverse_distances <- 1 / distances
    
    # Step 2: Random Weights
   
    random_weights <- rexp(k, rate = 1)
    
    # Step 3: Contribution Factors
   
    shuffled_indices <- sample(k)
    contribution_factors <- 1 / (2^shuffled_indices)
    
    # Step 4: Calculate Weights
    weights <- inverse_distances * random_weights * contribution_factors
    
    # Step 5: Normalize Weights
    normalized_weights <- weights / sum(weights)
    
    avatar_weights[i, ] <- normalized_weights
  }
  
  return(avatar_weights)
}



# Generate avatar weights
 set.seed(12)
avatar_weights <- generate_avatar_weights(knn_result, pca_transformed_data, k)

Generation of avatar in the latent space

# Assuming pca_result, avatar_weights, knn_result$nn.index, and pca_transformed_data are already defined

# Function to generate avatars in PCA space based on weights
generate_avatars_pca_space <- function(pca_transformed_data, knn_indices, weights) {
  n <- nrow(pca_transformed_data)
  avatars_pca <- matrix(nrow = n, ncol = ncol(pca_transformed_data))
  
  for (i in 1:n) {
    weighted_avatars <- pca_transformed_data[knn_indices[i, ], ] * weights[i, ]
    avatars_pca[i, ] <- colSums(weighted_avatars)
  }
  
  return(avatars_pca)
}
# Generate avatars in PCA space
avatars_pca_space <- generate_avatars_pca_space(pca_transformed_data, knn_result$nn.index, avatar_weights)

Return to the initial scale

# Assuming 'aids_pca' is the PCA object and 'avatars_pca_space' contains the avatars in PCA space
# Inverse PCA transformation
inverse_pca <- function(pca_object, pca_data) {
  return(pca_data %*% t(pca_object$rotation) + matrix(pca_object$center, nrow = nrow(pca_data), ncol = ncol(pca_object$rotation), byrow = TRUE))
}
avatars_original_scale <- inverse_pca(pca, avatars_pca_space)

# Assuming 'aids_data_normalized' contains the scaling attributes of the original data
# Inverse normalization (if the original data was normalized)
avatars_rescaled <- scale(avatars_original_scale, center = FALSE, scale = 1/attr(data_normalized, "scaled:scale"))
avatars_rescaled <- sweep(avatars_rescaled, 2, attr(data_normalized, "scaled:center"), "+")

Transform into tibble

avatars_tibble_knn10 <- as_tibble(avatars_rescaled) %>% 
  mutate(haplotype = round(haplotype, digits=0),
         cyp3A5D = round(cyp3A5D, digits=0),
         sexe_r  = round(sexe_r , digits=0),
         sexe_d  = round(sexe_d , digits=0),
         rejet_aigu  = round(rejet_aigu , digits=0),
         event = round(event, digits=0)
         # CYP3A4_1B = round(CYP3A4_1B, digits=0),
         # MDR1_C1236T = round(MDR1_C1236T, digits=0),
         # MDR1_G2677T = round(MDR1_G2677T, digits=0),
         # MDR1_C3435T = round(MDR1_C3435T, digits=0)
  ) 

avatars_tibble_factor_knn10 <- as_tibble(avatars_rescaled) %>% 
  mutate(haplotype = round(haplotype, digits=0),
         cyp3A5D = round(cyp3A5D, digits=0),
         sexe_r  = round(sexe_r , digits=0),
         sexe_d  = round(sexe_d , digits=0),
         rejet_aigu  = round(rejet_aigu , digits=0),
         event = round(event, digits=0)
         # CYP3A4_1B = round(CYP3A4_1B, digits=0),
         # MDR1_C1236T = round(MDR1_C1236T, digits=0),
         # MDR1_G2677T = round(MDR1_G2677T, digits=0),
         # MDR1_C3435T = round(MDR1_C3435T, digits=0)
  ) %>% 
  mutate(haplotype = as.factor(haplotype),
         cyp3A5D = as.factor(cyp3A5D),
         sexe_r = as.factor(sexe_r),
         sexe_d = as.factor(sexe_d),
         # CYP3A4_1B = as.factor(CYP3A4_1B),
         # MDR1_C1236T = as.factor(MDR1_C1236T),
         # MDR1_G2677T = as.factor(MDR1_G2677T),
         # MDR1_C3435T = as.factor(MDR1_C3435T),
         rejet_aigu = as.factor(rejet_aigu))

Plot of the synthetic and original in the latent space

# Combine original and synthetic data for visualization
combined_data <- rbind(
  original1 %>% mutate(DataType = 'Original'),
  avatars_tibble_knn10 %>% mutate(DataType = 'Synthetic')
)

# Perform PCA on combined data
combined_data_normalized <- scale(combined_data[, -which(names(combined_data) %in% c("DataType", "id"))])
combined_pca <- prcomp(combined_data_normalized, scale. = FALSE)

# Extract the first two principal components
combined_pca_data <- data.frame(combined_pca$x[, 1:2])
combined_pca_data$DataType <- combined_data$DataType

# Plot PCA with color differentiation
ggplot(combined_pca_data, aes(x = PC1, y = PC2, color = DataType)) +
  geom_point(alpha = 0.8) +
  theme_minimal() +
  labs(title = "PCA Plot", x = "Principal Component 1", y = "Principal Component 2", color = "Data Type")

Comparison of the datasets

Summary of the 2 datasets

## Vector of categorical variables that need transformation
catVars <- c("haplotype", "cyp3A5D",  "sexe_r",  "sexe_d", 
"rejet_aigu", "event")
## Create a variable list.
vars <- c( "haplotype", "cyp3A5D", "age_r", "sexe_r", "age_d", "sexe_d", 
"rejet_aigu", "TIF", "event", "delai_event", "DataType")
tableOne <- CreateTableOne(vars = vars, strata = "DataType",factorVars = catVars, data = combined_data)
tableOne2<-print(tableOne, nonnormal = c( "age_r", "age_d", "TIF", "delai_event"), printToggle=F, minMax=T)
Original Synthetic p test
n 253 253
haplotype (%) 0.008
1 97 (38.3) 93 ( 36.8)
2 123 (48.6) 146 ( 57.7)
3 33 (13.0) 14 ( 5.5)
cyp3A5D = 2 (%) 211 (83.4) 219 ( 86.6) 0.384
age_r (median [range]) 55.00 [19.00, 78.00] 54.21 [24.23, 73.48] 0.750 nonnorm
sexe_r = 2 (%) 156 (61.7) 165 ( 65.2) 0.460
age_d (median [range]) 40.00 [12.00, 73.00] 39.60 [15.46, 62.80] 0.866 nonnorm
sexe_d = 2 (%) 174 (68.8) 188 ( 74.3) 0.200
rejet_aigu = 2 (%) 81 (32.0) 72 ( 28.5) 0.439
TIF (median [range]) 1153.00 [303.00, 2580.00] 1156.78 [570.38, 1987.46] 0.785 nonnorm
event = 1 (%) 22 ( 8.7) 20 ( 7.9) 0.872
delai_event (median [range]) 5.34 [0.68, 15.83] 5.60 [0.96, 15.33] 0.382 nonnorm
DataType = Synthetic (%) 0 ( 0.0) 253 (100.0) <0.001
summary(original)
##  haplotype   cyp3A5D       age_r       sexe_r      age_d       sexe_d 
##  autre: 97   Es : 42   Min.   :19.00   F: 97   Min.   :12.00   F: 79  
##  het  :123   NEs:211   1st Qu.:44.00   M:156   1st Qu.:25.00   M:174  
##  hom  : 33             Median :55.00           Median :40.00          
##                        Mean   :53.84           Mean   :38.49          
##                        3rd Qu.:64.00           3rd Qu.:49.00          
##                        Max.   :78.00           Max.   :73.00          
##  rejet_aigu      TIF           event          delai_event    
##  0:172      Min.   : 303   Min.   :0.00000   Min.   : 0.680  
##  1: 81      1st Qu.: 975   1st Qu.:0.00000   1st Qu.: 2.920  
##             Median :1153   Median :0.00000   Median : 5.340  
##             Mean   :1199   Mean   :0.08696   Mean   : 6.044  
##             3rd Qu.:1368   3rd Qu.:0.00000   3rd Qu.: 8.700  
##             Max.   :2580   Max.   :1.00000   Max.   :15.830
summary(avatars_tibble_knn10)
##    haplotype        cyp3A5D          age_r           sexe_r     
##  Min.   :1.000   Min.   :1.000   Min.   :24.23   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:46.61   1st Qu.:1.000  
##  Median :2.000   Median :2.000   Median :54.21   Median :2.000  
##  Mean   :1.688   Mean   :1.866   Mean   :53.72   Mean   :1.652  
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:62.31   3rd Qu.:2.000  
##  Max.   :3.000   Max.   :2.000   Max.   :73.48   Max.   :2.000  
##      age_d           sexe_d        rejet_aigu         TIF        
##  Min.   :15.46   Min.   :1.000   Min.   :1.000   Min.   : 570.4  
##  1st Qu.:28.89   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1035.3  
##  Median :39.60   Median :2.000   Median :1.000   Median :1156.8  
##  Mean   :38.51   Mean   :1.743   Mean   :1.285   Mean   :1167.6  
##  3rd Qu.:47.61   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:1296.9  
##  Max.   :62.80   Max.   :2.000   Max.   :2.000   Max.   :1987.5  
##      event          delai_event     
##  Min.   :0.00000   Min.   : 0.9566  
##  1st Qu.:0.00000   1st Qu.: 4.0307  
##  Median :0.00000   Median : 5.5990  
##  Mean   :0.07905   Mean   : 5.9403  
##  3rd Qu.:0.00000   3rd Qu.: 7.1670  
##  Max.   :1.00000   Max.   :15.3301

individual data explorer

# boxplots
plot_boxplot(combined_data , by ="DataType") 

# histograms

# Function to create histogram for each continuous variable
plot_histograms <- function(data, var_name, group_var) {
  ggplot(data, aes(x = !!sym(var_name), fill = !!sym(group_var))) +
    geom_histogram(alpha = 0.5,show.legend = FALSE) +
    labs(x = var_name, y = "Count") +
    theme_minimal() +
    ggtitle(paste(var_name))
}

# Using select_if to identify continuous variables and map to apply the function
plots <- combined_data %>%
  select( -sexe_r,-sexe_d) %>% 
  select_if(is.numeric) %>%
  names() %>%
  map(~plot_histograms(combined_data, ., "DataType"))

# Optionally, print or arrange plots (e.g., using gridExtra or patchwork packages)

wrap_plots(plots)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

plot correlation

##Correlation Analysis
  cor_real <- cor(original1, use = "complete.obs")
  cor_synthetic <- cor(avatars_tibble_knn10, use = "complete.obs")
  
# plots
ggcorrplot(cor_real, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

# plots
ggcorrplot(cor_synthetic, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

Modele de Cox

# original
fit_original <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = original1)
## 
##   n= 253, number of events= 22 
## 
##                  coef  exp(coef)   se(coef)      z Pr(>|z|)    
## haplotype   1.2112071  3.3575352  0.3463273  3.497  0.00047 ***
## cyp3A5D    -1.2323909  0.2915946  0.5567303 -2.214  0.02685 *  
## age_r      -0.0039521  0.9960557  0.0187880 -0.210  0.83339    
## sexe_r     -0.0438849  0.9570641  0.4606422 -0.095  0.92410    
## age_d       0.0360206  1.0366772  0.0203668  1.769  0.07696 .  
## sexe_d      0.2786636  1.3213627  0.5181402  0.538  0.59070    
## rejet_aigu  1.0124644  2.7523756  0.4804379  2.107  0.03508 *  
## TIF        -0.0002268  0.9997732  0.0005753 -0.394  0.69345    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     3.3575     0.2978   1.70305    6.6193
## cyp3A5D       0.2916     3.4294   0.09792    0.8683
## age_r         0.9961     1.0040   0.96004    1.0334
## sexe_r        0.9571     1.0449   0.38801    2.3607
## age_d         1.0367     0.9646   0.99611    1.0789
## sexe_d        1.3214     0.7568   0.47861    3.6481
## rejet_aigu    2.7524     0.3633   1.07339    7.0576
## TIF           0.9998     1.0002   0.99865    1.0009
## 
## Concordance= 0.758  (se = 0.054 )
## Likelihood ratio test= 24.6  on 8 df,   p=0.002
## Wald test            = 21.09  on 8 df,   p=0.007
## Score (logrank) test = 25.84  on 8 df,   p=0.001
ggforest(fit_original)

# synthetique
fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF, data = avatars_tibble_knn10)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = avatars_tibble_knn10)
## 
##   n= 253, number of events= 20 
## 
##                 coef exp(coef)  se(coef)      z Pr(>|z|)    
## haplotype   2.189521  8.930938  0.638419  3.430 0.000604 ***
## cyp3A5D    -0.517214  0.596179  0.867578 -0.596 0.551069    
## age_r       0.011612  1.011680  0.026141  0.444 0.656889    
## sexe_r      0.821917  2.274856  0.637399  1.289 0.197229    
## age_d       0.041406  1.042275  0.029631  1.397 0.162302    
## sexe_d      2.595350 13.401278  1.198788  2.165 0.030389 *  
## rejet_aigu  1.420749  4.140222  0.816441  1.740 0.081828 .  
## TIF        -0.002234  0.997768  0.001226 -1.822 0.068386 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     8.9309    0.11197    2.5555    31.212
## cyp3A5D       0.5962    1.67735    0.1089     3.265
## age_r         1.0117    0.98845    0.9612     1.065
## sexe_r        2.2749    0.43959    0.6522     7.934
## age_d         1.0423    0.95944    0.9835     1.105
## sexe_d       13.4013    0.07462    1.2786   140.461
## rejet_aigu    4.1402    0.24153    0.8357    20.511
## TIF           0.9978    1.00224    0.9954     1.000
## 
## Concordance= 0.851  (se = 0.04 )
## Likelihood ratio test= 41.23  on 8 df,   p=2e-06
## Wald test            = 22.55  on 8 df,   p=0.004
## Score (logrank) test = 42.55  on 8 df,   p=1e-06
ggforest(fit_synthetique)

BootstepAIC based on BIC

Allow to see which vairable would have been selected

Original

boot.stepAIC(fit_original, original1, B = 100, k=log(nrow(original1)))
## 
## Summary of Bootstrapping the 'stepAIC()' procedure for
## 
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = original1)
## 
## Bootstrap samples: 100 
## Direction: backward 
## Penalty: 5.53 * df
## 
## Covariates selected
##            (%)
## haplotype   95
## rejet_aigu  47
## cyp3A5D     29
## age_d       21
## sexe_d       3
## Null         2
## sexe_r       2
## TIF          2
## 
## Coefficients Sign
##            + (%) - (%)
## age_d        100     0
## haplotype    100     0
## rejet_aigu   100     0
## sexe_d       100     0
## sexe_r       100     0
## cyp3A5D        0   100
## TIF            0   100
## 
## Stat Significance
##            (%)
## age_d      100
## cyp3A5D    100
## haplotype  100
## rejet_aigu 100
## sexe_d     100
## sexe_r     100
## TIF        100
## 
## 
## The stepAIC() for the original data-set gave
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype, data = original1)
## 
##             coef exp(coef) se(coef)     z        p
## haplotype 1.2035    3.3319   0.3276 3.674 0.000239
## 
## Likelihood ratio test=14.01  on 1 df, p=0.0001822
## n= 253, number of events= 22 
## 
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Surv(delai_event, event) ~ haplotype + cyp3A5D + age_r + sexe_r + 
##     age_d + sexe_d + rejet_aigu + TIF
## 
## Final Model:
## Surv(delai_event, event) ~ haplotype
## 
## 
##           Step Df    Deviance Resid. Df Resid. Dev      AIC
## 1                                    14  -24.59675 212.4446
## 2     - sexe_r  1 0.009067423        15  -24.58769 206.9203
## 3      - age_r  1 0.039034149        16  -24.54865 201.4259
## 4        - TIF  1 0.251427799        17  -24.29723 196.1439
## 5     - sexe_d  1 0.442126797        18  -23.85510 191.0527
## 6      - age_d  1 2.811491990        19  -21.04361 188.3308
## 7    - cyp3A5D  1 2.805921958        20  -18.23768 185.6033
## 8 - rejet_aigu  1 4.230950507        21  -14.00673 184.3009

synhtetic knn10

boot.stepAIC(fit_synthetique, avatars_tibble_knn10, B = 100, k=log(nrow(avatars_tibble_knn10)))
## 
## Summary of Bootstrapping the 'stepAIC()' procedure for
## 
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = avatars_tibble_knn10)
## 
## Bootstrap samples: 100 
## Direction: backward 
## Penalty: 5.53 * df
## 
## Covariates selected
##            (%)
## haplotype   98
## sexe_d      66
## rejet_aigu  53
## TIF         39
## age_d       24
## sexe_r      18
## age_r        8
## cyp3A5D      5
## 
## Coefficients Sign
##            + (%) - (%)
## age_d        100     0
## age_r        100     0
## haplotype    100     0
## rejet_aigu   100     0
## sexe_d       100     0
## sexe_r       100     0
## cyp3A5D       20    80
## TIF            0   100
## 
## Stat Significance
##               (%)
## age_d      100.00
## age_r      100.00
## haplotype  100.00
## sexe_r     100.00
## TIF        100.00
## cyp3A5D     80.00
## rejet_aigu  62.26
## sexe_d      62.12
## 
## 
## The stepAIC() for the original data-set gave
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + rejet_aigu, 
##     data = avatars_tibble_knn10)
## 
##              coef exp(coef) se(coef)     z        p
## haplotype  1.4166    4.1230   0.3771 3.756 0.000173
## rejet_aigu 1.9030    6.7057   0.7580 2.510 0.012058
## 
## Likelihood ratio test=28.87  on 2 df, p=5.394e-07
## n= 253, number of events= 20 
## 
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Surv(delai_event, event) ~ haplotype + cyp3A5D + age_r + sexe_r + 
##     age_d + sexe_d + rejet_aigu + TIF
## 
## Final Model:
## Surv(delai_event, event) ~ haplotype + rejet_aigu
## 
## 
##        Step Df  Deviance Resid. Df Resid. Dev      AIC
## 1                               12  -41.23295 171.7220
## 2   - age_r  1 0.2005000        13  -41.03245 166.3891
## 3 - cyp3A5D  1 0.4592484        14  -40.57320 161.3150
## 4  - sexe_r  1 1.4438502        15  -39.12935 157.2254
## 5   - age_d  1 2.4135185        16  -36.71583 154.1055
## 6     - TIF  1 3.9163173        17  -32.79951 152.4885
## 7  - sexe_d  1 3.9340176        18  -28.86549 150.8891
Final model original
fit_original <- coxph(Surv(delai_event, event) ~ haplotype , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype, data = original1)
## 
##   n= 253, number of events= 22 
## 
##             coef exp(coef) se(coef)     z Pr(>|z|)    
## haplotype 1.2035    3.3319   0.3276 3.674 0.000239 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## haplotype     3.332     0.3001     1.753     6.332
## 
## Concordance= 0.682  (se = 0.044 )
## Likelihood ratio test= 14.01  on 1 df,   p=2e-04
## Wald test            = 13.5  on 1 df,   p=2e-04
## Score (logrank) test = 14.91  on 1 df,   p=1e-04
Final model synthetic
fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype  , data = avatars_tibble_knn10)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype, data = avatars_tibble_knn10)
## 
##   n= 253, number of events= 20 
## 
##             coef exp(coef) se(coef)     z Pr(>|z|)    
## haplotype 1.7378    5.6846   0.3963 4.385 1.16e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## haplotype     5.685     0.1759     2.614     12.36
## 
## Concordance= 0.721  (se = 0.035 )
## Likelihood ratio test= 19.27  on 1 df,   p=1e-05
## Wald test            = 19.23  on 1 df,   p=1e-05
## Score (logrank) test = 19.86  on 1 df,   p=8e-06

Bootstrap of the coefficient for haplotype

Allow to define the variability range of HR for a given dataset (intra dataset variability)

# Define the Cox model
cox_model <- function(data, indices) {
  d <- data[indices,] # allows bootstrapping to sample the data
  fit <- coxph(Surv(delai_event, event) ~ haplotype , data=d)
  return(fit$coefficients)
}

# Set the seed for reproducibility
set.seed(12)

# Bootstrap the Cox model
boot_results <- boot(data=avatars_tibble_knn10, statistic=cox_model, R=100)

# Convert bootstrap results to a data frame for ggplot2
boot_hrs <- exp(boot_results$t) # Convert log(HR) to HR
hr_data <- data.frame(HR=boot_hrs[,1])

# Calculate summary statistics
summary_stats <- quantile(hr_data$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1))
names(summary_stats) <- c("Min","2.5th", "5th", "25th", "Median", "75th", "95th","97.5th","Max")

# Create the histogram
ggplot(hr_data, aes(x=HR)) +
  geom_histogram(bins=30, fill="#007a86", color="black") +
  # geom_vline(aes(xintercept=summary_stats["Min"]), color="red", linetype="dashed") +
  geom_vline(aes(xintercept=summary_stats["25th"]), color="gray", linetype="dashed", linewidth=2) +
  geom_vline(aes(xintercept=summary_stats["Median"]), color="blue", linetype="dashed", linewidth=2) +
  geom_vline(aes(xintercept=summary_stats["75th"]), color="gray", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["Max"]), color="purple", linetype="dashed") +
  labs(title="Bootstrap Distribution of Hazard Ratios", x="Hazard Ratio (HR)", y="Frequency") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

# Print summary statistics
print(summary_stats)
##       Min     2.5th       5th      25th    Median      75th      95th    97.5th 
##  2.612300  3.397346  3.834001  4.983690  5.923145  7.502672 14.680292 16.598891 
##       Max 
## 23.260106
Modele final & KM
km_original <- survfit(Surv(delai_event, event) ~ haplotype, data = original)
km_original_plot <- ggsurvplot(
  km_original,
  data = original,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)
km_original_plot

km_synthetique <- survfit(Surv(delai_event, event) ~ haplotype, data = avatars_tibble_knn10)
km_synthetique_avatar_10 <- ggsurvplot(
  km_synthetique,
  data = avatars_tibble_knn10,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)
km_synthetique_avatar_10

Plots original & synthetic combined

## combine data
combined_df <- rbind(original %>% mutate(group = "original"), avatars_tibble_factor_knn10 %>% mutate(group = "synthetic")) %>% mutate(combined_haplotype = str_c(haplotype,"_", group ))

## fit the model
km_combined <- survfit(Surv(delai_event, event) ~ combined_haplotype, data = combined_df)

# plot
ggsurvplot(fit = km_combined, 
           data = combined_df,
           
  size = 1,                 # change line size
  conf.int = FALSE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.35, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)

Graphical exploraiotn of distribution

library(GGally)

pm_knn10 <- combined_df %>% select(haplotype:delai_event, group) %>% ggpairs(
  ggplot2::aes(colour = group,alpha = 0.5),
  upper = list(continuous = wrap("cor", size = 1.5)),
  lower=list(combo=wrap("facethist", binwidth=0.5))) + 
  theme(strip.text.x = element_text(size = 5),
           strip.text.y = element_text(size = 5),axis.text = element_text(size = 5))
pm_knn10

# ggsave("comparaison_distribution_knn10.pdf")

function to aggregating the HR and CI95 results of Cox models after changing 100 times the seed with knn = 10

Allow to define the variability range of HR for different Avatar generated with different seed but the same knn (inter dataset variability)

# Assuming all your existing functions and necessary libraries are loaded

run_model_with_seed <- function(seed_value) {
 
  
  # Number of neighbors
  k <- 10  # Adjust this based on your requirement
  
  pca_transformed_data <- pca$x
  knn_result <- get.knn(pca_transformed_data, k)
  
  generate_avatar_weights <- function(knn_result, pca_transformed_data, k) {
    n <- nrow(pca_transformed_data)
    avatar_weights <- matrix(nrow = n, ncol = k)
    
    for (i in 1:n) {
      # Step 1: Inverse of Distances
      distances <- sqrt(rowSums((pca_transformed_data[knn_result$nn.index[i, ], ] - pca_transformed_data[i, ])^2))
      inverse_distances <- 1 / distances
      
      # Step 2: Random Weights

      random_weights <- rexp(k, rate = 1)
      
      # Step 3: Contribution Factors
  
      shuffled_indices <- sample(k)
      contribution_factors <- 1 / (2^shuffled_indices)
      
      # Step 4: Calculate Weights
      weights <- inverse_distances * random_weights * contribution_factors
      
      # Step 5: Normalize Weights
      normalized_weights <- weights / sum(weights)
      
      avatar_weights[i, ] <- normalized_weights
    }
    
    return(avatar_weights)
  }
  
  
  
  # Generate avatar weights
   set.seed(seed_value)
  avatar_weights <- generate_avatar_weights(knn_result, pca_transformed_data, k)
  
  
  # Function to generate avatars in PCA space based on weights
  generate_avatars_pca_space <- function(pca_transformed_data, knn_indices, weights) {
    n <- nrow(pca_transformed_data)
    avatars_pca <- matrix(nrow = n, ncol = ncol(pca_transformed_data))
    
    for (i in 1:n) {
      weighted_avatars <- pca_transformed_data[knn_indices[i, ], ] * weights[i, ]
      avatars_pca[i, ] <- colSums(weighted_avatars)
    }
    
    return(avatars_pca)
  }
  # Generate avatars in PCA space
  avatars_pca_space <- generate_avatars_pca_space(pca_transformed_data, knn_result$nn.index, avatar_weights)
  

  # Inverse PCA transformation
  inverse_pca <- function(pca_object, pca_data) {
    return(pca_data %*% t(pca_object$rotation) + matrix(pca_object$center, nrow = nrow(pca_data), ncol = ncol(pca_object$rotation), byrow = TRUE))
  }
  avatars_original_scale <- inverse_pca(pca, avatars_pca_space)
  
 
  # Inverse normalization (if the original data was normalized)
  avatars_rescaled <- scale(avatars_original_scale, center = FALSE, scale = 1/attr(data_normalized, "scaled:scale"))
  avatars_rescaled <- sweep(avatars_rescaled, 2, attr(data_normalized, "scaled:center"), "+")
  
avatars_tibble_knn10   <- as_tibble(avatars_rescaled) %>% 
    mutate(haplotype = round(haplotype, digits=0),
           cyp3A5D = round(cyp3A5D, digits=0),
           sexe_r  = round(sexe_r , digits=0),
           sexe_d  = round(sexe_d , digits=0),
           rejet_aigu  = round(rejet_aigu , digits=0),
           event = round(event, digits=0)
           # CYP3A4_1B = round(CYP3A4_1B, digits=0),
           # MDR1_C1236T = round(MDR1_C1236T, digits=0),
           # MDR1_G2677T = round(MDR1_G2677T, digits=0),
           # MDR1_C3435T = round(MDR1_C3435T, digits=0)
    ) 
  
avatars_tibble_factor_knn10   <- as_tibble(avatars_rescaled) %>% 
    mutate(haplotype = round(haplotype, digits=0),
           cyp3A5D = round(cyp3A5D, digits=0),
           sexe_r  = round(sexe_r , digits=0),
           sexe_d  = round(sexe_d , digits=0),
           rejet_aigu  = round(rejet_aigu , digits=0),
           event = round(event, digits=0)
           # CYP3A4_1B = round(CYP3A4_1B, digits=0),
           # MDR1_C1236T = round(MDR1_C1236T, digits=0),
           # MDR1_G2677T = round(MDR1_G2677T, digits=0),
           # MDR1_C3435T = round(MDR1_C3435T, digits=0)
    ) %>% 
    mutate(haplotype = as.factor(haplotype),
           cyp3A5D = as.factor(cyp3A5D),
           sexe_r = as.factor(sexe_r),
           sexe_d = as.factor(sexe_d),
           # CYP3A4_1B = as.factor(CYP3A4_1B),
           # MDR1_C1236T = as.factor(MDR1_C1236T),
           # MDR1_G2677T = as.factor(MDR1_G2677T),
           # MDR1_C3435T = as.factor(MDR1_C3435T),
           rejet_aigu = as.factor(rejet_aigu))

  
  # Finally, fit the Cox model
  fit <- coxph(Surv(delai_event, event) ~ haplotype , 
               data = avatars_tibble_knn10)

  # Calculate confidence intervals
  ci <- confint(fit)
  
  return(list(fit = fit, ci = ci))
}


extract_hrs_and_cis <- function(model_output) {
  coefs <- model_output$fit$coefficients
  ci <- model_output$ci

  hr <- exp(coefs)
  ci_lower <- exp(ci[,"2.5 %"])
  ci_upper <- exp(ci[,"97.5 %"])

  return(data.frame(variable = names(hr), hr = hr, ci_lower = ci_lower, ci_upper = ci_upper))
}

# Generate a list of seed values
seed_values <- sample(x=100) # Modify this if you need different seed values

# Apply the algorithm with different seed values
model_results <- map(seed_values, run_model_with_seed)

# Extract HR and CI from model results
extracted_results <- map(model_results, extract_hrs_and_cis)

# Combine results into a single data frame
combined_results <- bind_rows(extracted_results)

# Calculate median HR and CI for each variable
# aggregate_metrics <- combined_results %>%
#   group_by(variable) %>%
#   summarize(
#     median_hr = median(hr),
#     median_ci_lower = median(ci_lower),
#     median_ci_upper = median(ci_upper)
#   )
# 
# aggregate_metrics

# Calculate the specified percentiles for HRs for each variable
percentile_metrics <- combined_results %>%
  group_by(variable) %>%
  summarize(
    percentile_0 = quantile(hr, probs = 0),
    percentile_5 = quantile(hr, probs = 0.05),
    percentile_25 = quantile(hr, probs = 0.25),
    percentile_50 = quantile(hr, probs = 0.5),
    percentile_75 = quantile(hr, probs = 0.75),
    percentile_95 = quantile(hr, probs = 0.95),
    percentile_100 = quantile(hr, probs = 1)
  ) %>%
  pivot_longer(-variable, names_to = "Percentile_HR", values_to = "Value_HR") %>% 
  mutate(Value_HR = round(Value_HR, 2))
# percentile_metrics
# datatable(percentile_metrics)
knitr::kable(percentile_metrics %>%  mutate(Value_HR = round(Value_HR, 2)), "simple")
variable Percentile_HR Value_HR
haplotype percentile_0 2.40
haplotype percentile_5 3.98
haplotype percentile_25 5.41
haplotype percentile_50 6.43
haplotype percentile_75 8.20
haplotype percentile_95 12.84
haplotype percentile_100 21.61

data augmentation avatar

With 10 knn

We investigate the effect of data augmentaiotn with a defined seed and knn=10

data_augment_avatar <- function(x) {
data_normalized <- scale(original1)
pca <- prcomp(data_normalized, scale. = FALSE)# pour selecitonner le nombre de cp rank. = 3
# Number of neighbors
k <- 10  # Adjust this based on your requirement
pca_transformed_data <- pca$x
knn_result <- get.knn(pca_transformed_data, k)

generate_avatar_weights <- function(knn_result, pca_transformed_data, k) {
  n <- nrow(pca_transformed_data)
  avatar_weights <- matrix(nrow = n, ncol = k)
  
  for (i in 1:n) {
    # Step 1: Inverse of Distances
    distances <- sqrt(rowSums((pca_transformed_data[knn_result$nn.index[i, ], ] - pca_transformed_data[i, ])^2))
    inverse_distances <- 1 / distances
    
    # Step 2: Random Weights

    random_weights <- rexp(k, rate = 1)
    
    # Step 3: Contribution Factors
 
    shuffled_indices <- sample(k)
    contribution_factors <- 1 / (2^shuffled_indices)
    
    # Step 4: Calculate Weights
    weights <- inverse_distances * random_weights * contribution_factors
    
    # Step 5: Normalize Weights
    normalized_weights <- weights / sum(weights)
    
    avatar_weights[i, ] <- normalized_weights
  }
  
  return(avatar_weights)
}



# Generate avatar weights
   set.seed( str_c(1,x))
avatar_weights <- generate_avatar_weights(knn_result, pca_transformed_data, k)

# Assuming pca_result, avatar_weights, knn_result$nn.index, and pca_transformed_data are already defined

# Function to generate avatars in PCA space based on weights
generate_avatars_pca_space <- function(pca_transformed_data, knn_indices, weights) {
  n <- nrow(pca_transformed_data)
  avatars_pca <- matrix(nrow = n, ncol = ncol(pca_transformed_data))
  
  for (i in 1:n) {
    weighted_avatars <- pca_transformed_data[knn_indices[i, ], ] * weights[i, ]
    avatars_pca[i, ] <- colSums(weighted_avatars)
  }
  
  return(avatars_pca)
}
# Generate avatars in PCA space
avatars_pca_space <- generate_avatars_pca_space(pca_transformed_data, knn_result$nn.index, avatar_weights)
# Assuming 'aids_pca' is the PCA object and 'avatars_pca_space' contains the avatars in PCA space
# Inverse PCA transformation
inverse_pca <- function(pca_object, pca_data) {
  return(pca_data %*% t(pca_object$rotation) + matrix(pca_object$center, nrow = nrow(pca_data), ncol = ncol(pca_object$rotation), byrow = TRUE))
}
avatars_original_scale <- inverse_pca(pca, avatars_pca_space)

# Assuming 'aids_data_normalized' contains the scaling attributes of the original data
# Inverse normalization (if the original data was normalized)
avatars_rescaled <- scale(avatars_original_scale, center = FALSE, scale = 1/attr(data_normalized, "scaled:scale"))
avatars_rescaled <- sweep(avatars_rescaled, 2, attr(data_normalized, "scaled:center"), "+")

avatars_tibble <- as_tibble(avatars_rescaled) %>% 
  mutate(haplotype = round(haplotype, digits=0),
         cyp3A5D = round(cyp3A5D, digits=0),
         sexe_r  = round(sexe_r , digits=0),
         sexe_d  = round(sexe_d , digits=0),
         rejet_aigu  = round(rejet_aigu , digits=0),
         event = round(event, digits=0)
         # CYP3A4_1B = round(CYP3A4_1B, digits=0),
         # MDR1_C1236T = round(MDR1_C1236T, digits=0),
         # MDR1_G2677T = round(MDR1_G2677T, digits=0),
         # MDR1_C3435T = round(MDR1_C3435T, digits=0)
  ) 
}

iteration <- c(1:4)

augmented_data_10 <- map_dfr(iteration, data_augment_avatar, .id = "iter_")

augmented_data_10 <- augmented_data_10 %>% 
  select(-iter_)

augmented_data_10_factor_knn10 <- augmented_data_10 %>% 
 mutate(haplotype = as.factor(haplotype),
         cyp3A5D = as.factor(cyp3A5D),
         sexe_r = as.factor(sexe_r),
         sexe_d = as.factor(sexe_d),
         # CYP3A4_1B = as.factor(CYP3A4_1B),
         # MDR1_C1236T = as.factor(MDR1_C1236T),
         # MDR1_G2677T = as.factor(MDR1_G2677T),
         # MDR1_C3435T = as.factor(MDR1_C3435T),
         rejet_aigu = as.factor(rejet_aigu))

Plot of the synthetic and original in the latent space

# Combine original and synthetic data for visualization
combined_data <- rbind(
  original1 %>% mutate(DataType = 'Original'),
  augmented_data_10 %>% mutate(DataType = 'Synthetic')
)

# Perform PCA on combined data
combined_data_normalized <- scale(combined_data[, -which(names(combined_data) %in% c("DataType", "id"))])
combined_pca <- prcomp(combined_data_normalized, scale. = FALSE)

# Extract the first two principal components
combined_pca_data <- data.frame(combined_pca$x[, 1:2])
combined_pca_data$DataType <- combined_data$DataType

# Plot PCA with color differentiation
ggplot(combined_pca_data, aes(x = PC1, y = PC2, color = DataType)) +
  geom_point(alpha = 0.5) +
  theme_minimal() +
  labs(title = "PCA Plot", x = "Principal Component 1", y = "Principal Component 2", color = "Data Type")

Export data augmented knn = 10

write_csv(augmented_data_10, file = "avatar_sfpt_knn10_data_augmented.csv")

Comparison of the datasets augmented and original

Summary of the 2 datasets

## Vector of categorical variables that need transformation
catVars <- c("haplotype", "cyp3A5D",  "sexe_r",  "sexe_d", 
"rejet_aigu", "event")
## Create a variable list.
vars <- c( "haplotype", "cyp3A5D", "age_r", "sexe_r", "age_d", "sexe_d", 
"rejet_aigu", "TIF", "event", "delai_event", "DataType")
tableOne <- CreateTableOne(vars = vars, strata = "DataType",factorVars = catVars, data = combined_data)
tableOne2<-print(tableOne, nonnormal = c( "age_r", "age_d", "TIF", "delai_event"), printToggle=F, minMax=T)
Original Synthetic p test
n 253 1012
haplotype (%) <0.001
1 97 (38.3) 356 ( 35.2)
2 123 (48.6) 602 ( 59.5)
3 33 (13.0) 54 ( 5.3)
cyp3A5D = 2 (%) 211 (83.4) 880 ( 87.0) 0.172
age_r (median [range]) 55.00 [19.00, 78.00] 54.41 [23.53, 75.26] 0.565 nonnorm
sexe_r = 2 (%) 156 (61.7) 666 ( 65.8) 0.244
age_d (median [range]) 40.00 [12.00, 73.00] 39.92 [15.46, 63.56] 0.683 nonnorm
sexe_d = 2 (%) 174 (68.8) 743 ( 73.4) 0.161
rejet_aigu = 2 (%) 81 (32.0) 276 ( 27.3) 0.155
TIF (median [range]) 1153.00 [303.00, 2580.00] 1141.45 [372.32, 2040.25] 0.473 nonnorm
event = 1 (%) 22 ( 8.7) 80 ( 7.9) 0.776
delai_event (median [range]) 5.34 [0.68, 15.83] 5.31 [0.87, 15.33] 0.756 nonnorm
DataType = Synthetic (%) 0 ( 0.0) 1012 (100.0) <0.001
summary(original)
##  haplotype   cyp3A5D       age_r       sexe_r      age_d       sexe_d 
##  autre: 97   Es : 42   Min.   :19.00   F: 97   Min.   :12.00   F: 79  
##  het  :123   NEs:211   1st Qu.:44.00   M:156   1st Qu.:25.00   M:174  
##  hom  : 33             Median :55.00           Median :40.00          
##                        Mean   :53.84           Mean   :38.49          
##                        3rd Qu.:64.00           3rd Qu.:49.00          
##                        Max.   :78.00           Max.   :73.00          
##  rejet_aigu      TIF           event          delai_event    
##  0:172      Min.   : 303   Min.   :0.00000   Min.   : 0.680  
##  1: 81      1st Qu.: 975   1st Qu.:0.00000   1st Qu.: 2.920  
##             Median :1153   Median :0.00000   Median : 5.340  
##             Mean   :1199   Mean   :0.08696   Mean   : 6.044  
##             3rd Qu.:1368   3rd Qu.:0.00000   3rd Qu.: 8.700  
##             Max.   :2580   Max.   :1.00000   Max.   :15.830
summary(augmented_data_10)
##    haplotype        cyp3A5D         age_r           sexe_r          age_d      
##  Min.   :1.000   Min.   :1.00   Min.   :23.53   Min.   :1.000   Min.   :15.46  
##  1st Qu.:1.000   1st Qu.:2.00   1st Qu.:45.97   1st Qu.:1.000   1st Qu.:29.24  
##  Median :2.000   Median :2.00   Median :54.41   Median :2.000   Median :39.92  
##  Mean   :1.702   Mean   :1.87   Mean   :53.53   Mean   :1.658   Mean   :38.81  
##  3rd Qu.:2.000   3rd Qu.:2.00   3rd Qu.:62.10   3rd Qu.:2.000   3rd Qu.:47.25  
##  Max.   :3.000   Max.   :2.00   Max.   :75.26   Max.   :2.000   Max.   :63.56  
##      sexe_d        rejet_aigu         TIF             event        
##  Min.   :1.000   Min.   :1.000   Min.   : 372.3   Min.   :0.00000  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1029.4   1st Qu.:0.00000  
##  Median :2.000   Median :1.000   Median :1141.4   Median :0.00000  
##  Mean   :1.734   Mean   :1.273   Mean   :1163.0   Mean   :0.07905  
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:1279.4   3rd Qu.:0.00000  
##  Max.   :2.000   Max.   :2.000   Max.   :2040.3   Max.   :1.00000  
##   delai_event     
##  Min.   : 0.8704  
##  1st Qu.: 3.8048  
##  Median : 5.3123  
##  Mean   : 5.7545  
##  3rd Qu.: 7.1851  
##  Max.   :15.3301

individual data explorer

# boxplots
plot_boxplot(combined_data , by ="DataType") 

# histograms

# Function to create histogram for each continuous variable
plot_histograms <- function(data, var_name, group_var) {
  ggplot(data, aes(x = !!sym(var_name), fill = !!sym(group_var))) +
    geom_histogram(alpha = 0.5,show.legend = FALSE) +
    labs(x = var_name, y = "Count") +
    theme_minimal() +
    ggtitle(paste(var_name))
}

# Using select_if to identify continuous variables and map to apply the function
plots <- combined_data %>%
  select( -sexe_r,-sexe_d) %>% 
  select_if(is.numeric) %>%
  names() %>%
  map(~plot_histograms(combined_data, ., "DataType"))

# Optionally, print or arrange plots (e.g., using gridExtra or patchwork packages)

wrap_plots(plots)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

plot correlation

##Correlation Analysis
  cor_real <- cor(original1, use = "complete.obs")
  cor_synthetic <- cor(augmented_data_10, use = "complete.obs")
  
# plots
ggcorrplot(cor_real, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

# plots
ggcorrplot(cor_synthetic, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

Modele de Cox

# original
fit_original <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = original1)
## 
##   n= 253, number of events= 22 
## 
##                  coef  exp(coef)   se(coef)      z Pr(>|z|)    
## haplotype   1.2112071  3.3575352  0.3463273  3.497  0.00047 ***
## cyp3A5D    -1.2323909  0.2915946  0.5567303 -2.214  0.02685 *  
## age_r      -0.0039521  0.9960557  0.0187880 -0.210  0.83339    
## sexe_r     -0.0438849  0.9570641  0.4606422 -0.095  0.92410    
## age_d       0.0360206  1.0366772  0.0203668  1.769  0.07696 .  
## sexe_d      0.2786636  1.3213627  0.5181402  0.538  0.59070    
## rejet_aigu  1.0124644  2.7523756  0.4804379  2.107  0.03508 *  
## TIF        -0.0002268  0.9997732  0.0005753 -0.394  0.69345    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     3.3575     0.2978   1.70305    6.6193
## cyp3A5D       0.2916     3.4294   0.09792    0.8683
## age_r         0.9961     1.0040   0.96004    1.0334
## sexe_r        0.9571     1.0449   0.38801    2.3607
## age_d         1.0367     0.9646   0.99611    1.0789
## sexe_d        1.3214     0.7568   0.47861    3.6481
## rejet_aigu    2.7524     0.3633   1.07339    7.0576
## TIF           0.9998     1.0002   0.99865    1.0009
## 
## Concordance= 0.758  (se = 0.054 )
## Likelihood ratio test= 24.6  on 8 df,   p=0.002
## Wald test            = 21.09  on 8 df,   p=0.007
## Score (logrank) test = 25.84  on 8 df,   p=0.001
ggforest(fit_original)

# synthetique
fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF , data = augmented_data_10)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = augmented_data_10)
## 
##   n= 1012, number of events= 80 
## 
##                  coef  exp(coef)   se(coef)      z Pr(>|z|)    
## haplotype   1.5000227  4.4817906  0.2358547  6.360 2.02e-10 ***
## cyp3A5D    -0.4037335  0.6678220  0.4804159 -0.840 0.400693    
## age_r       0.0101762  1.0102281  0.0124915  0.815 0.415273    
## sexe_r      0.2129631  1.2373389  0.2734454  0.779 0.436089    
## age_d       0.0593288  1.0611240  0.0148048  4.007 6.14e-05 ***
## sexe_d      1.2160189  3.3737299  0.3497060  3.477 0.000507 ***
## rejet_aigu  0.8153153  2.2598881  0.2965094  2.750 0.005965 ** 
## TIF        -0.0002206  0.9997794  0.0005277 -0.418 0.675868    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     4.4818     0.2231    2.8229     7.116
## cyp3A5D       0.6678     1.4974    0.2605     1.712
## age_r         1.0102     0.9899    0.9858     1.035
## sexe_r        1.2373     0.8082    0.7240     2.115
## age_d         1.0611     0.9424    1.0308     1.092
## sexe_d        3.3737     0.2964    1.7000     6.695
## rejet_aigu    2.2599     0.4425    1.2639     4.041
## TIF           0.9998     1.0002    0.9987     1.001
## 
## Concordance= 0.784  (se = 0.027 )
## Likelihood ratio test= 112.6  on 8 df,   p=<2e-16
## Wald test            = 98.87  on 8 df,   p=<2e-16
## Score (logrank) test = 124.7  on 8 df,   p=<2e-16
ggforest(fit_synthetique)

BootstepAIC augmented synthetic knn10

boot.stepAIC(fit_synthetique, augmented_data_10, B = 100, k=log(nrow(augmented_data_10)))
## 
## Summary of Bootstrapping the 'stepAIC()' procedure for
## 
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = augmented_data_10)
## 
## Bootstrap samples: 100 
## Direction: backward 
## Penalty: 6.92 * df
## 
## Covariates selected
##            (%)
## haplotype  100
## age_d       95
## sexe_d      91
## rejet_aigu  53
## age_r        2
## cyp3A5D      1
## sexe_r       1
## TIF          1
## 
## Coefficients Sign
##            + (%) - (%)
## age_d        100     0
## age_r        100     0
## haplotype    100     0
## rejet_aigu   100     0
## sexe_d       100     0
## sexe_r       100     0
## cyp3A5D        0   100
## TIF            0   100
## 
## Stat Significance
##            (%)
## age_d      100
## age_r      100
## cyp3A5D    100
## haplotype  100
## rejet_aigu 100
## sexe_d     100
## sexe_r     100
## TIF        100
## 
## 
## The stepAIC() for the original data-set gave
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + age_d + 
##     sexe_d + rejet_aigu, data = augmented_data_10)
## 
##               coef exp(coef) se(coef)     z        p
## haplotype  1.48573   4.41817  0.21748 6.832 8.40e-12
## age_d      0.06233   1.06431  0.01411 4.418 9.97e-06
## sexe_d     1.12534   3.08126  0.33887 3.321 0.000897
## rejet_aigu 0.75888   2.13589  0.28913 2.625 0.008673
## 
## Likelihood ratio test=111.1  on 4 df, p=< 2.2e-16
## n= 1012, number of events= 80 
## 
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Surv(delai_event, event) ~ haplotype + cyp3A5D + age_r + sexe_r + 
##     age_d + sexe_d + rejet_aigu + TIF
## 
## Final Model:
## Surv(delai_event, event) ~ haplotype + age_d + sexe_d + rejet_aigu
## 
## 
##        Step Df  Deviance Resid. Df Resid. Dev      AIC
## 1                               72  -112.6018 850.2685
## 2     - TIF  1 0.1770448        73  -112.4248 843.5258
## 3 - cyp3A5D  1 0.5888032        74  -111.8360 837.1949
## 4   - age_r  1 0.5216025        75  -111.3144 830.7969
## 5  - sexe_r  1 0.2104665        76  -111.1039 824.0876

Final model original

fit_original <- coxph(Surv(delai_event, event) ~ haplotype + rejet_aigu , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + rejet_aigu, 
##     data = original1)
## 
##   n= 253, number of events= 22 
## 
##              coef exp(coef) se(coef)     z Pr(>|z|)    
## haplotype  1.1681    3.2160   0.3261 3.582 0.000341 ***
## rejet_aigu 0.9238    2.5188   0.4661 1.982 0.047482 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype      3.216     0.3109     1.697     6.094
## rejet_aigu     2.519     0.3970     1.010     6.280
## 
## Concordance= 0.732  (se = 0.05 )
## Likelihood ratio test= 18.24  on 2 df,   p=1e-04
## Wald test            = 17.44  on 2 df,   p=2e-04
## Score (logrank) test = 19.29  on 2 df,   p=6e-05

Final model synthetic

fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D + age_d + rejet_aigu, data = augmented_data_10)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_d + rejet_aigu, data = augmented_data_10)
## 
##   n= 1012, number of events= 80 
## 
##                coef exp(coef) se(coef)      z Pr(>|z|)    
## haplotype   1.40489   4.07506  0.21436  6.554 5.61e-11 ***
## cyp3A5D    -0.23650   0.78939  0.47551 -0.497 0.618938    
## age_d       0.04935   1.05058  0.01412  3.494 0.000475 ***
## rejet_aigu  1.04938   2.85587  0.27963  3.753 0.000175 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     4.0751     0.2454    2.6771     6.203
## cyp3A5D       0.7894     1.2668    0.3108     2.005
## age_d         1.0506     0.9519    1.0219     1.080
## rejet_aigu    2.8559     0.3502    1.6509     4.940
## 
## Concordance= 0.777  (se = 0.025 )
## Likelihood ratio test= 97.69  on 4 df,   p=<2e-16
## Wald test            = 87.65  on 4 df,   p=<2e-16
## Score (logrank) test = 98.98  on 4 df,   p=<2e-16

Modele final & KM

km_original <- survfit(Surv(delai_event, event) ~ haplotype, data = original)
ggsurvplot(
  km_original,
  data = original,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)

km_synthetique <- survfit(Surv(delai_event, event) ~ haplotype, data = augmented_data_10_factor_knn10)
km_synthetique_avatar_10_augmented <- ggsurvplot(
  km_synthetique,
  data = augmented_data_10_factor_knn10,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)
km_synthetique_avatar_10_augmented

Plots original & synthetic combined

## combine data
combined_df <- rbind(original %>% mutate(group = "original"), augmented_data_10_factor_knn10 %>% mutate(group = "synthetic")) %>% mutate(combined_haplotype = str_c(haplotype,"_", group ))

## fit the model
km_combined <- survfit(Surv(delai_event, event) ~ combined_haplotype, data = combined_df)

# plot
ggsurvplot(fit = km_combined, 
           data = combined_df,
           
  size = 1,                 # change line size
  conf.int = FALSE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.35, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)

Graphical exploraiotn of distribution

library(GGally)

pm_knn10_augmented <- combined_df %>% select(haplotype:delai_event, group) %>% ggpairs(
  ggplot2::aes(colour = group,alpha = 0.5),
  upper = list(continuous = wrap("cor", size = 1.5)),
  lower=list(combo=wrap("facethist", binwidth=0.5))) + 
  theme(strip.text.x = element_text(size = 5),
           strip.text.y = element_text(size = 5),axis.text = element_text(size = 5))
pm_knn10_augmented

# ggsave("comparaison_distribution_augmented_knn10.pdf")

Bootstrap of the coefficient for haplotype

Allow to define the variability range of HR for a given dataset (intra dataset variability)

# Define the Cox model
cox_model <- function(data, indices) {
  d <- data[indices,] # allows bootstrapping to sample the data
  fit <- coxph(Surv(delai_event, event) ~ haplotype + age_d + sexe_d + rejet_aigu , data=d)
  return(fit$coefficients)
}

# Set the seed for reproducibility
set.seed(12)

# Bootstrap the Cox model
boot_results <- boot(data=augmented_data_10, statistic=cox_model, R=100)

# Convert bootstrap results to a data frame for ggplot2
boot_hrs <- exp(boot_results$t) # Convert log(HR) to HR
hr_data_haplo <- data.frame(HR=boot_hrs[,1])
hr_data_age_d <- data.frame(HR=boot_hrs[,2])
hr_data_cyp3A5D <- data.frame(HR=boot_hrs[,3])
hr_data_rejet_aigu <- data.frame(HR=boot_hrs[,4])
# Calculate summary statistics
summary_stats <- quantile(hr_data_haplo$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) %>%  
  bind_rows(quantile(hr_data_age_d$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) ) %>% 
  bind_rows(quantile(hr_data_cyp3A5D$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) ) %>% 
  bind_rows(quantile(hr_data_rejet_aigu$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) )  
names(summary_stats) <- c("Min","2.5th", "5th", "25th", "Median", "75th", "95th","97.5th","Max")

# Create the histogram
ggplot(hr_data_haplo, aes(x=HR)) +
  geom_histogram(bins=30, fill="#007a86", color="black") +
  # # geom_vline(aes(xintercept=summary_stats["Min"]), color="red", linetype="dashed") +
  # geom_vline(aes(xintercept=summary_stats["25th"][[1]][[1]]), color="gray", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["Median"][[1]][[1]]), color="blue", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["75th"])[[1]][[1]], color="gray", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["Max"]), color="purple", linetype="dashed") +
  labs(title="Bootstrap Distribution of Hazard Ratios", x="Hazard Ratio (HR)", y="Frequency") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

# Print summary statistics
knitr::kable(summary_stats, "simple")
Min 2.5th 5th 25th Median 75th 95th 97.5th Max
3.022779 3.315683 3.576255 4.051954 4.604895 5.091066 6.488926 7.449832 8.648328
1.029248 1.039455 1.043175 1.053725 1.064096 1.073754 1.085664 1.090181 1.099419
1.550175 1.790092 1.846553 2.601247 3.116408 4.110787 5.926792 6.620560 10.414952
1.190996 1.293889 1.349926 1.724742 2.148875 2.651489 3.993213 4.209455 5.695346

inter seed vairbaility augmented knn10

Allow to define the inter variability range of HR for augmented knn=10 (inter dataset variability) by using 100 bootraps

# Assuming all your existing functions and necessary libraries are loaded

run_model_with_seed <- function(seed_value) {
  
  
  # augmentaiotn of data
  
  data_augment_avatar <- function(x) {
    data_normalized <- scale(original1)
    pca <- prcomp(data_normalized, scale. = FALSE)# pour selecitonner le nombre de cp rank. = 3
    # Number of neighbors
    k <- 10 # Adjust this based on your requirement
    pca_transformed_data <- pca$x
    knn_result <- get.knn(pca_transformed_data, k)
    
    generate_avatar_weights <- function(knn_result, pca_transformed_data, k) {
      n <- nrow(pca_transformed_data)
      avatar_weights <- matrix(nrow = n, ncol = k)
      
      for (i in 1:n) {
        # Step 1: Inverse of Distances
        distances <- sqrt(rowSums((pca_transformed_data[knn_result$nn.index[i, ], ] - pca_transformed_data[i, ])^2))
        inverse_distances <- 1 / distances
        
        # Step 2: Random Weights
        
        random_weights <- rexp(k, rate = 1)
        
        # Step 3: Contribution Factors
        
        shuffled_indices <- sample(k)
        contribution_factors <- 1 / (2^shuffled_indices)
        
        # Step 4: Calculate Weights
        weights <- inverse_distances * random_weights * contribution_factors
        
        # Step 5: Normalize Weights
        normalized_weights <- weights / sum(weights)
        
        avatar_weights[i, ] <- normalized_weights
      }
      
      return(avatar_weights)
    }
    
    
    
    # Generate avatar weights
 
    avatar_weights <- generate_avatar_weights(knn_result, pca_transformed_data, k)
    
    # Assuming pca_result, avatar_weights, knn_result$nn.index, and pca_transformed_data are already defined
    
    # Function to generate avatars in PCA space based on weights
    generate_avatars_pca_space <- function(pca_transformed_data, knn_indices, weights) {
      n <- nrow(pca_transformed_data)
      avatars_pca <- matrix(nrow = n, ncol = ncol(pca_transformed_data))
      
      for (i in 1:n) {
        weighted_avatars <- pca_transformed_data[knn_indices[i, ], ] * weights[i, ]
        avatars_pca[i, ] <- colSums(weighted_avatars)
      }
      
      return(avatars_pca)
    }
    # Generate avatars in PCA space
    avatars_pca_space <- generate_avatars_pca_space(pca_transformed_data, knn_result$nn.index, avatar_weights)
    # Assuming 'aids_pca' is the PCA object and 'avatars_pca_space' contains the avatars in PCA space
    # Inverse PCA transformation
    inverse_pca <- function(pca_object, pca_data) {
      return(pca_data %*% t(pca_object$rotation) + matrix(pca_object$center, nrow = nrow(pca_data), ncol = ncol(pca_object$rotation), byrow = TRUE))
    }
    avatars_original_scale <- inverse_pca(pca, avatars_pca_space)
    
    # Assuming 'aids_data_normalized' contains the scaling attributes of the original data
    # Inverse normalization (if the original data was normalized)
    avatars_rescaled <- scale(avatars_original_scale, center = FALSE, scale = 1/attr(data_normalized, "scaled:scale"))
    avatars_rescaled <- sweep(avatars_rescaled, 2, attr(data_normalized, "scaled:center"), "+")
    
    avatars_tibble <- as_tibble(avatars_rescaled) %>% 
      mutate(haplotype = round(haplotype, digits=0),
             cyp3A5D = round(cyp3A5D, digits=0),
             sexe_r  = round(sexe_r , digits=0),
             sexe_d  = round(sexe_d , digits=0),
             rejet_aigu  = round(rejet_aigu , digits=0),
             event = round(event, digits=0)
             # CYP3A4_1B = round(CYP3A4_1B, digits=0),
             # MDR1_C1236T = round(MDR1_C1236T, digits=0),
             # MDR1_G2677T = round(MDR1_G2677T, digits=0),
             # MDR1_C3435T = round(MDR1_C3435T, digits=0)
      ) 
  }
  
  iteration <- c(1:4)
  set.seed(seed_value)
  augmented_data_x <- map_dfr(iteration, data_augment_avatar, .id = "iter_")
  
  augmented_data_x <- augmented_data_x %>% 
    select(-iter_)
  
  
  ###############
  # Finally, fit the Cox model
  fit <- coxph(Surv(delai_event, event) ~ haplotype + age_d + sexe_d + rejet_aigu, 
               data = augmented_data_x)
  coefs <- fit$coefficients
  hr <- exp(coefs)
  return(data.frame(variable = names(hr), hr = hr))
  # Calculate confidence intervals
#  ci <- confint(fit)
  
 # return(list(fit = fit, ci = ci))
}




# Generate a list of seed values
seed_value <- sample(x=100) # Modify this if you need different seed values

# Apply the algorithm with different seed values
model_results <- map(seed_value, run_model_with_seed)

# Extract HR and CI from model results
#extracted_results <- map(model_results, extract_hrs_and_cis)

# Combine results into a single data frame
combined_results <- bind_rows(model_results)

# Calculate median HR and CI for each variable
# aggregate_metrics <- combined_results %>%
#   group_by(variable) %>%
#   summarize(
#     median_hr = median(hr),
#     median_ci_lower = median(ci_lower),
#     median_ci_upper = median(ci_upper)
#   )
# 
# aggregate_metrics

# Calculate the specified percentiles for HRs for each variable
percentile_metrics <- combined_results %>%
  group_by(variable) %>%
  summarize(
    percentile_0 = quantile(hr, probs = 0),
    percentile_5 = quantile(hr, probs = 0.05),
    percentile_25 = quantile(hr, probs = 0.25),
    percentile_50 = quantile(hr, probs = 0.5),
    percentile_75 = quantile(hr, probs = 0.75),
    percentile_95 = quantile(hr, probs = 0.95),
    percentile_100 = quantile(hr, probs = 1)
  ) %>%
  pivot_longer(-variable, names_to = "Percentile_HR", values_to = "Value_HR") %>% 
  mutate(Value_HR = round(Value_HR, 2))
# percentile_metrics
# datatable(percentile_metrics)
knitr::kable(percentile_metrics %>%  mutate(Value_HR = round(Value_HR, 2)), "simple")
variable Percentile_HR Value_HR
age_d percentile_0 1.01
age_d percentile_5 1.02
age_d percentile_25 1.03
age_d percentile_50 1.04
age_d percentile_75 1.05
age_d percentile_95 1.07
age_d percentile_100 1.08
haplotype percentile_0 3.74
haplotype percentile_5 4.24
haplotype percentile_25 4.79
haplotype percentile_50 5.30
haplotype percentile_75 5.93
haplotype percentile_95 7.76
haplotype percentile_100 10.34
rejet_aigu percentile_0 1.48
rejet_aigu percentile_5 1.84
rejet_aigu percentile_25 2.65
rejet_aigu percentile_50 3.26
rejet_aigu percentile_75 3.83
rejet_aigu percentile_95 4.78
rejet_aigu percentile_100 5.95
sexe_d percentile_0 1.09
sexe_d percentile_5 1.50
sexe_d percentile_25 1.88
sexe_d percentile_50 2.21
sexe_d percentile_75 2.64
sexe_d percentile_95 3.58
sexe_d percentile_100 8.48

Survival Variational Auto-Encoder (surVAE)

These data have been generated by Clement Benoist using the Synthcity python libnrary from the Van der Schaar lab

Non augmented data

load the data

survae <- read_csv("sfpt24_survae_data_v240111.dat") %>% 
   select(haplotype:delai_event)
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
survae_factor <- survae %>% 
  mutate(haplotype = as.factor(haplotype),
         cyp3A5D = as.factor(cyp3A5D),
         sexe_r = as.factor(sexe_r),
         sexe_d = as.factor(sexe_d),
         # CYP3A4_1B = as.factor(CYP3A4_1B),
         # MDR1_C1236T = as.factor(MDR1_C1236T),
         # MDR1_G2677T = as.factor(MDR1_G2677T),
         # MDR1_C3435T = as.factor(MDR1_C3435T),
         rejet_aigu = as.factor(rejet_aigu))

Comparison of the datasets

Summary of the 2 datasets

summary(original)
##  haplotype   cyp3A5D       age_r       sexe_r      age_d       sexe_d 
##  autre: 97   Es : 42   Min.   :19.00   F: 97   Min.   :12.00   F: 79  
##  het  :123   NEs:211   1st Qu.:44.00   M:156   1st Qu.:25.00   M:174  
##  hom  : 33             Median :55.00           Median :40.00          
##                        Mean   :53.84           Mean   :38.49          
##                        3rd Qu.:64.00           3rd Qu.:49.00          
##                        Max.   :78.00           Max.   :73.00          
##  rejet_aigu      TIF           event          delai_event    
##  0:172      Min.   : 303   Min.   :0.00000   Min.   : 0.680  
##  1: 81      1st Qu.: 975   1st Qu.:0.00000   1st Qu.: 2.920  
##             Median :1153   Median :0.00000   Median : 5.340  
##             Mean   :1199   Mean   :0.08696   Mean   : 6.044  
##             3rd Qu.:1368   3rd Qu.:0.00000   3rd Qu.: 8.700  
##             Max.   :2580   Max.   :1.00000   Max.   :15.830
summary(survae)
##    haplotype        cyp3A5D          age_r           sexe_r     
##  Min.   :1.000   Min.   :1.000   Min.   :20.00   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:42.00   1st Qu.:1.000  
##  Median :2.000   Median :2.000   Median :56.00   Median :2.000  
##  Mean   :1.862   Mean   :1.941   Mean   :53.49   Mean   :1.644  
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:64.00   3rd Qu.:2.000  
##  Max.   :3.000   Max.   :2.000   Max.   :78.00   Max.   :2.000  
##      age_d           sexe_d        rejet_aigu         TIF      
##  Min.   :13.00   Min.   :1.000   Min.   :1.000   Min.   : 588  
##  1st Qu.:21.00   1st Qu.:2.000   1st Qu.:1.000   1st Qu.: 896  
##  Median :34.00   Median :2.000   Median :1.000   Median :1057  
##  Mean   :32.97   Mean   :1.779   Mean   :1.292   Mean   :1066  
##  3rd Qu.:44.00   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:1188  
##  Max.   :60.00   Max.   :2.000   Max.   :2.000   Max.   :1912  
##      event          delai_event    
##  Min.   :0.00000   Min.   : 1.067  
##  1st Qu.:0.00000   1st Qu.: 4.513  
##  Median :0.00000   Median : 6.130  
##  Mean   :0.05534   Mean   : 6.307  
##  3rd Qu.:0.00000   3rd Qu.: 7.733  
##  Max.   :1.00000   Max.   :14.883
# Combine original and synthetic data for visualization
combined_data <- rbind(
  original1 %>% mutate(DataType = 'Original'),
  survae_factor %>% mutate(DataType = 'Synthetic')
) %>% mutate_if(is.character, factor)
## Vector of categorical variables that need transformation
catVars <- c("haplotype", "cyp3A5D",  "sexe_r",  "sexe_d", 
"rejet_aigu", "event")
## Create a variable list.
vars <- c( "haplotype", "cyp3A5D", "age_r", "sexe_r", "age_d", "sexe_d", 
"rejet_aigu", "TIF", "event", "delai_event", "DataType")
tableOne <- CreateTableOne(vars = vars, strata = "DataType",factorVars = catVars, data = combined_data)
tableOne2<-print(tableOne, nonnormal = c( "age_r", "age_d", "TIF", "delai_event"), printToggle=F, minMax=T)
Original Synthetic p test
n 253 253
haplotype (%) 0.047
1 97 (38.3) 71 ( 28.1)
2 123 (48.6) 146 ( 57.7)
3 33 (13.0) 36 ( 14.2)
cyp3A5D = 2 (%) 211 (83.4) 238 ( 94.1) <0.001
age_r (median [range]) 55.00 [19.00, 78.00] 56.00 [20.00, 78.00] 0.820 nonnorm
sexe_r = 2 (%) 156 (61.7) 163 ( 64.4) 0.581
age_d (median [range]) 40.00 [12.00, 73.00] 34.00 [13.00, 60.00] <0.001 nonnorm
sexe_d = 2 (%) 174 (68.8) 197 ( 77.9) 0.027
rejet_aigu = 2 (%) 81 (32.0) 74 ( 29.2) 0.563
TIF (median [range]) 1153.00 [303.00, 2580.00] 1057.00 [588.00, 1912.00] <0.001 nonnorm
event = 1 (%) 22 ( 8.7) 14 ( 5.5) 0.226
delai_event (median [range]) 5.34 [0.68, 15.83] 6.13 [1.07, 14.88] 0.017 nonnorm
DataType = Synthetic (%) 0 ( 0.0) 253 (100.0) <0.001

individual data explorer

# boxplots
plot_boxplot(combined_data , by ="DataType") 

# histograms

# Function to create histogram for each continuous variable
plot_histograms <- function(data, var_name, group_var) {
  ggplot(data, aes(x = !!sym(var_name), fill = !!sym(group_var))) +
    geom_histogram(alpha = 0.5,show.legend = FALSE) +
    labs(x = var_name, y = "Count") +
    theme_minimal() +
    ggtitle(paste(var_name))
}

# Using select_if to identify continuous variables and map to apply the function
plots <- combined_data %>%
  select( -sexe_r,-sexe_d) %>% 
  select_if(is.numeric) %>%
  names() %>%
  map(~plot_histograms(combined_data, ., "DataType"))

# Optionally, print or arrange plots (e.g., using gridExtra or patchwork packages)

wrap_plots(plots)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

plot correlation

##Correlation Analysis
  cor_real <- cor(original1, use = "complete.obs")
  cor_synthetic <- cor(survae, use = "complete.obs")
  
# plots
ggcorrplot(cor_real, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

# plots
ggcorrplot(cor_synthetic, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

Modele de Cox

# original
fit_original <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = original1)
## 
##   n= 253, number of events= 22 
## 
##                  coef  exp(coef)   se(coef)      z Pr(>|z|)    
## haplotype   1.2112071  3.3575352  0.3463273  3.497  0.00047 ***
## cyp3A5D    -1.2323909  0.2915946  0.5567303 -2.214  0.02685 *  
## age_r      -0.0039521  0.9960557  0.0187880 -0.210  0.83339    
## sexe_r     -0.0438849  0.9570641  0.4606422 -0.095  0.92410    
## age_d       0.0360206  1.0366772  0.0203668  1.769  0.07696 .  
## sexe_d      0.2786636  1.3213627  0.5181402  0.538  0.59070    
## rejet_aigu  1.0124644  2.7523756  0.4804379  2.107  0.03508 *  
## TIF        -0.0002268  0.9997732  0.0005753 -0.394  0.69345    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     3.3575     0.2978   1.70305    6.6193
## cyp3A5D       0.2916     3.4294   0.09792    0.8683
## age_r         0.9961     1.0040   0.96004    1.0334
## sexe_r        0.9571     1.0449   0.38801    2.3607
## age_d         1.0367     0.9646   0.99611    1.0789
## sexe_d        1.3214     0.7568   0.47861    3.6481
## rejet_aigu    2.7524     0.3633   1.07339    7.0576
## TIF           0.9998     1.0002   0.99865    1.0009
## 
## Concordance= 0.758  (se = 0.054 )
## Likelihood ratio test= 24.6  on 8 df,   p=0.002
## Wald test            = 21.09  on 8 df,   p=0.007
## Score (logrank) test = 25.84  on 8 df,   p=0.001
ggforest(fit_original, data = original)

# synthetique
fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF , data = survae)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = survae)
## 
##   n= 253, number of events= 14 
## 
##                  coef  exp(coef)   se(coef)      z Pr(>|z|)
## haplotype   0.7102441  2.0344878  0.4369992  1.625    0.104
## cyp3A5D    -0.6508903  0.5215812  1.1378964 -0.572    0.567
## age_r       0.0249004  1.0252130  0.0277982  0.896    0.370
## sexe_r     -0.1058940  0.8995200  0.5575219 -0.190    0.849
## age_d       0.0387958  1.0395581  0.0264003  1.470    0.142
## sexe_d     -0.3278550  0.7204675  0.6301131 -0.520    0.603
## rejet_aigu  0.2023881  1.2243231  0.5851686  0.346    0.729
## TIF         0.0006969  1.0006972  0.0010851  0.642    0.521
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     2.0345     0.4915   0.86393     4.791
## cyp3A5D       0.5216     1.9172   0.05607     4.852
## age_r         1.0252     0.9754   0.97085     1.083
## sexe_r        0.8995     1.1117   0.30161     2.683
## age_d         1.0396     0.9619   0.98714     1.095
## sexe_d        0.7205     1.3880   0.20954     2.477
## rejet_aigu    1.2243     0.8168   0.38887     3.855
## TIF           1.0007     0.9993   0.99857     1.003
## 
## Concordance= 0.66  (se = 0.088 )
## Likelihood ratio test= 6.99  on 8 df,   p=0.5
## Wald test            = 6.69  on 8 df,   p=0.6
## Score (logrank) test = 6.94  on 8 df,   p=0.5
ggforest(fit_synthetique)

BootstepAIC synhtetic survae

boot.stepAIC(fit_synthetique, survae, B = 100, k=log(nrow(survae)))
## 
## Summary of Bootstrapping the 'stepAIC()' procedure for
## 
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = survae)
## 
## Bootstrap samples: 100 
## Direction: backward 
## Penalty: 5.53 * df
## 
## Covariates selected
##            (%)
## Null        56
## age_d       21
## haplotype   19
## TIF         10
## sexe_d       7
## cyp3A5D      6
## rejet_aigu   3
## sexe_r       3
## age_r        2
## 
## Coefficients Sign
##             + (%)  - (%)
## haplotype  100.00   0.00
## rejet_aigu 100.00   0.00
## age_d       95.24   4.76
## TIF         80.00  20.00
## age_r       50.00  50.00
## sexe_r      33.33  66.67
## sexe_d      14.29  85.71
## cyp3A5D      0.00 100.00
## 
## Stat Significance
##               (%)
## age_d      100.00
## age_r      100.00
## haplotype  100.00
## rejet_aigu 100.00
## sexe_r     100.00
## TIF        100.00
## sexe_d      85.71
## cyp3A5D     83.33
## 
## 
## The stepAIC() for the original data-set gave
## Call:  coxph(formula = Surv(delai_event, event) ~ 1, data = survae)
## 
## Null model
##   log likelihood= -63.40256 
##   n= 253 
## 
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Surv(delai_event, event) ~ haplotype + cyp3A5D + age_r + sexe_r + 
##     age_d + sexe_d + rejet_aigu + TIF
## 
## Final Model:
## Surv(delai_event, event) ~ 1
## 
## 
##           Step Df   Deviance Resid. Df Resid. Dev      AIC
## 1                                    6  -6.990853 164.0814
## 2     - sexe_r  1 0.03588011         7  -6.954973 158.5839
## 3 - rejet_aigu  1 0.11582350         8  -6.839149 153.1663
## 4     - sexe_d  1 0.25862268         9  -6.580527 147.8915
## 5    - cyp3A5D  1 0.33049617        10  -6.250031 142.6886
## 6        - TIF  1 0.42154575        11  -5.828485 137.5768
## 7      - age_r  1 0.90884864        12  -4.919636 132.9523
## 8  - haplotype  1 2.49230935        13  -2.427327 129.9112
## 9      - age_d  1 2.42732679        14   0.000000 126.8051

Final model original

fit_original <- coxph(Surv(delai_event, event) ~ haplotype +    rejet_aigu , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + rejet_aigu, 
##     data = original1)
## 
##   n= 253, number of events= 22 
## 
##              coef exp(coef) se(coef)     z Pr(>|z|)    
## haplotype  1.1681    3.2160   0.3261 3.582 0.000341 ***
## rejet_aigu 0.9238    2.5188   0.4661 1.982 0.047482 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype      3.216     0.3109     1.697     6.094
## rejet_aigu     2.519     0.3970     1.010     6.280
## 
## Concordance= 0.732  (se = 0.05 )
## Likelihood ratio test= 18.24  on 2 df,   p=1e-04
## Wald test            = 17.44  on 2 df,   p=2e-04
## Score (logrank) test = 19.29  on 2 df,   p=6e-05

Final model synthetic

fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype + rejet_aigu , data = survae)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + rejet_aigu, 
##     data = survae)
## 
##   n= 253, number of events= 14 
## 
##              coef exp(coef) se(coef)     z Pr(>|z|)
## haplotype  0.5636    1.7569   0.4190 1.345    0.179
## rejet_aigu 0.2708    1.3111   0.5664 0.478    0.633
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype      1.757     0.5692    0.7728     3.994
## rejet_aigu     1.311     0.7627    0.4320     3.979
## 
## Concordance= 0.581  (se = 0.081 )
## Likelihood ratio test= 1.88  on 2 df,   p=0.4
## Wald test            = 1.88  on 2 df,   p=0.4
## Score (logrank) test = 1.9  on 2 df,   p=0.4

Bootstrap of the coefficient for haplotype

Allow to define the variability range of HR for a given dataset (intra dataset variability)

# Set the seed for reproducibility
set.seed(12)

# Bootstrap the Cox model
boot_results <- boot(data=survae, statistic=cox_model, R=100)

# Convert bootstrap results to a data frame for ggplot2
boot_hrs <- exp(boot_results$t) # Convert log(HR) to HR
hr_data <- data.frame(HR=boot_hrs[,1])

# Calculate summary statistics
summary_stats <- quantile(hr_data$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1))
names(summary_stats) <- c("Min","2.5th", "5th", "25th", "Median", "75th", "95th","97.5th","Max")

# Create the histogram
# Create the histogram
ggplot(hr_data, aes(x=HR)) +
  geom_histogram(bins=30, fill="#007a86", color="black") +
  # # geom_vline(aes(xintercept=summary_stats["Min"]), color="red", linetype="dashed") +
  # geom_vline(aes(xintercept=summary_stats["25th"][[1]][[1]]), color="gray", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["Median"][[1]][[1]]), color="blue", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["75th"])[[1]][[1]], color="gray", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["Max"]), color="purple", linetype="dashed") +
  labs(title="Bootstrap Distribution of Hazard Ratios", x="Hazard Ratio (HR)", y="Frequency") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

# Print summary statistics
knitr::kable(summary_stats, "simple")
x
Min 0.7351723
2.5th 0.8378569
5th 0.8928419
25th 1.4116339
Median 1.8234747
75th 2.5604567
95th 4.1124517
97.5th 4.6540383
Max 5.2444288

Modele final & KM

km_original <- survfit(Surv(delai_event, event) ~ haplotype, data = original)
ggsurvplot(
  km_original,
  data = original,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)

km_synthetique <- survfit(Surv(delai_event, event) ~ haplotype, data = survae_factor)
km_synthetique_survae <- ggsurvplot(
  km_synthetique,
  data = survae_factor,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)
km_synthetique_survae

Plots original & synthetic combined

## combine data
combined_df <- rbind(original %>% mutate(group = "original"), survae_factor %>% mutate(group = "synthetic")) %>% mutate(combined_haplotype = str_c(haplotype,"_", group ))

## fit the model
km_combined <- survfit(Surv(delai_event, event) ~ combined_haplotype, data = combined_df)

# plot
ggsurvplot(fit = km_combined, 
           data = combined_df,
           
  size = 1,                 # change line size
  conf.int = FALSE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.35, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)

Graphical exploration of distribution

library(GGally)

pm_survae <- combined_df %>% select(haplotype:delai_event, group) %>% ggpairs(
  ggplot2::aes(colour = group,alpha = 0.5),
  upper = list(continuous = wrap("cor", size = 1.5)),
  lower=list(combo=wrap("facethist", binwidth=0.5))) + 
  theme(strip.text.x = element_text(size = 5),
           strip.text.y = element_text(size = 5),axis.text = element_text(size = 5))
pm_survae

# ggsave("comparaison_distribution_survae.pdf")

Evaluaiton of variability interdataset tvae

# Définir le répertoire où se trouvent les fichiers
repertoire <- "~/Documents/avatar/tvae_ctgan_variability/Gen_data_synth_for_bootstrap/Generate_graft_loss3_multi/Graft_loss_survae"

# Lire tous les fichiers CSV dans le répertoire
liste_donnees <- list.files(repertoire, pattern = "*.dat", full.names = TRUE) %>%
  map(read_csv)
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Appliquer le modèle de Cox à chaque jeu de données
resultats <- map(liste_donnees, ~ coxph(Surv(delai_event, event) ~ haplotype + rejet_aigu, data = .x))

# Extraire les HR et les quantiles pour chaque variable
quantiles <- c(0, 5, 25, 50, 75, 95, 100)

hr_haplotype <- map(resultats, ~ tidy(.x, exponentiate = TRUE)) %>%
  map_dfr(~ .x %>% filter(term == "haplotype") %>% select(estimate)) %>%
  reframe(across(estimate, ~ quantile(., probs = quantiles/100))) %>% 
  mutate(quantiles = c(0, 5, 25, 50, 75, 95, 100), name = "haplotype")


hr_rejet_aigu <- map(resultats, ~ tidy(.x, exponentiate = TRUE)) %>%
  map_dfr(~ .x %>% filter(term == "rejet_aigu") %>% select(estimate)) %>%
  reframe(across(estimate, ~ quantile(., probs = quantiles/100))) %>% 
  mutate(quantiles = c(0, 5, 25, 50, 75, 95, 100), name = "rejet_aigu")

# Afficher et combiner les résultats
hr_results_tvae <- bind_rows(hr_haplotype, hr_rejet_aigu)
# Print summary statistics
knitr::kable(hr_results_tvae, "simple")
estimate quantiles name
2.097934e-01 0 haplotype
6.379875e-01 5 haplotype
1.243284e+00 25 haplotype
1.912866e+00 50 haplotype
2.641539e+00 75 haplotype
7.438773e+00 95 haplotype
1.959189e+09 100 haplotype
0.000000e+00 0 rejet_aigu
0.000000e+00 5 rejet_aigu
8.129948e-01 25 rejet_aigu
1.344378e+00 50 rejet_aigu
2.359905e+00 75 rejet_aigu
5.321860e+00 95 rejet_aigu
1.209022e+09 100 rejet_aigu

Survtvae Augmented data

load the data

survae_augmented <- read_csv("sfpt24_survae_data_large_v240111.dat") %>% 
   select(haplotype:delai_event)
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
survae_augmented_factor <- survae_augmented %>% 
  mutate(haplotype = as.factor(haplotype),
         cyp3A5D = as.factor(cyp3A5D),
         sexe_r = as.factor(sexe_r),
         sexe_d = as.factor(sexe_d),
         # CYP3A4_1B = as.factor(CYP3A4_1B),
         # MDR1_C1236T = as.factor(MDR1_C1236T),
         # MDR1_G2677T = as.factor(MDR1_G2677T),
         # MDR1_C3435T = as.factor(MDR1_C3435T),
         rejet_aigu = as.factor(rejet_aigu))

Comparison of the datasets

Summary of the 2 datasets

summary(original)
##  haplotype   cyp3A5D       age_r       sexe_r      age_d       sexe_d 
##  autre: 97   Es : 42   Min.   :19.00   F: 97   Min.   :12.00   F: 79  
##  het  :123   NEs:211   1st Qu.:44.00   M:156   1st Qu.:25.00   M:174  
##  hom  : 33             Median :55.00           Median :40.00          
##                        Mean   :53.84           Mean   :38.49          
##                        3rd Qu.:64.00           3rd Qu.:49.00          
##                        Max.   :78.00           Max.   :73.00          
##  rejet_aigu      TIF           event          delai_event    
##  0:172      Min.   : 303   Min.   :0.00000   Min.   : 0.680  
##  1: 81      1st Qu.: 975   1st Qu.:0.00000   1st Qu.: 2.920  
##             Median :1153   Median :0.00000   Median : 5.340  
##             Mean   :1199   Mean   :0.08696   Mean   : 6.044  
##             3rd Qu.:1368   3rd Qu.:0.00000   3rd Qu.: 8.700  
##             Max.   :2580   Max.   :1.00000   Max.   :15.830
summary(survae_augmented)
##    haplotype        cyp3A5D          age_r           sexe_r     
##  Min.   :1.000   Min.   :1.000   Min.   :24.00   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:43.00   1st Qu.:1.000  
##  Median :2.000   Median :2.000   Median :56.00   Median :2.000  
##  Mean   :1.862   Mean   :1.958   Mean   :53.64   Mean   :1.639  
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:63.00   3rd Qu.:2.000  
##  Max.   :3.000   Max.   :2.000   Max.   :78.00   Max.   :2.000  
##      age_d           sexe_d        rejet_aigu         TIF      
##  Min.   :12.00   Min.   :1.000   Min.   :1.000   Min.   : 565  
##  1st Qu.:21.00   1st Qu.:2.000   1st Qu.:1.000   1st Qu.: 925  
##  Median :34.00   Median :2.000   Median :1.000   Median :1058  
##  Mean   :33.44   Mean   :1.781   Mean   :1.286   Mean   :1070  
##  3rd Qu.:44.00   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:1195  
##  Max.   :66.00   Max.   :2.000   Max.   :2.000   Max.   :2191  
##      event          delai_event     
##  Min.   :0.00000   Min.   : 0.8223  
##  1st Qu.:0.00000   1st Qu.: 4.5201  
##  Median :0.00000   Median : 6.2705  
##  Mean   :0.05237   Mean   : 6.3985  
##  3rd Qu.:0.00000   3rd Qu.: 7.9969  
##  Max.   :1.00000   Max.   :15.7025
# Combine original and synthetic data for visualization
combined_data <- rbind(
  original1 %>% mutate(DataType = 'Original'),
  survae_augmented_factor %>% mutate(DataType = 'Synthetic')
) %>% mutate_if(is.character, factor)
## Vector of categorical variables that need transformation
catVars <- c("haplotype", "cyp3A5D",  "sexe_r",  "sexe_d", 
"rejet_aigu", "event")
## Create a variable list.
vars <- c( "haplotype", "cyp3A5D", "age_r", "sexe_r", "age_d", "sexe_d", 
"rejet_aigu", "TIF", "event", "delai_event", "DataType")
tableOne <- CreateTableOne(vars = vars, strata = "DataType",factorVars = catVars, data = combined_data)
tableOne2<-print(tableOne, nonnormal = c( "age_r", "age_d", "TIF", "delai_event"), printToggle=F, minMax=T)
Original Synthetic p test
n 253 1012
haplotype (%) 0.020
1 97 (38.3) 296 ( 29.2)
2 123 (48.6) 560 ( 55.3)
3 33 (13.0) 156 ( 15.4)
cyp3A5D = 2 (%) 211 (83.4) 969 ( 95.8) <0.001
age_r (median [range]) 55.00 [19.00, 78.00] 56.00 [24.00, 78.00] 0.888 nonnorm
sexe_r = 2 (%) 156 (61.7) 647 ( 63.9) 0.549
age_d (median [range]) 40.00 [12.00, 73.00] 34.00 [12.00, 66.00] <0.001 nonnorm
sexe_d = 2 (%) 174 (68.8) 790 ( 78.1) 0.003
rejet_aigu = 2 (%) 81 (32.0) 289 ( 28.6) 0.315
TIF (median [range]) 1153.00 [303.00, 2580.00] 1058.00 [565.00, 2191.00] <0.001 nonnorm
event = 1 (%) 22 ( 8.7) 53 ( 5.2) 0.053
delai_event (median [range]) 5.34 [0.68, 15.83] 6.27 [0.82, 15.70] 0.002 nonnorm
DataType = Synthetic (%) 0 ( 0.0) 1012 (100.0) <0.001

individual data explorer

# boxplots
plot_boxplot(combined_data , by ="DataType") 

# histograms

# Function to create histogram for each continuous variable
plot_histograms <- function(data, var_name, group_var) {
  ggplot(data, aes(x = !!sym(var_name), fill = !!sym(group_var))) +
    geom_histogram(alpha = 0.5,show.legend = FALSE) +
    labs(x = var_name, y = "Count") +
    theme_minimal() +
    ggtitle(paste(var_name))
}

# Using select_if to identify continuous variables and map to apply the function
plots <- combined_data %>%
  select( -sexe_r,-sexe_d) %>% 
  select_if(is.numeric) %>%
  names() %>%
  map(~plot_histograms(combined_data, ., "DataType"))

# Optionally, print or arrange plots (e.g., using gridExtra or patchwork packages)

wrap_plots(plots)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

plot correlation

##Correlation Analysis
  cor_real <- cor(original1, use = "complete.obs")
  cor_synthetic <- cor(survae_augmented, use = "complete.obs")
  
# plots
ggcorrplot(cor_real, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

# plots
ggcorrplot(cor_synthetic, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

Modele de Cox

# original
fit_original <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = original1)
## 
##   n= 253, number of events= 22 
## 
##                  coef  exp(coef)   se(coef)      z Pr(>|z|)    
## haplotype   1.2112071  3.3575352  0.3463273  3.497  0.00047 ***
## cyp3A5D    -1.2323909  0.2915946  0.5567303 -2.214  0.02685 *  
## age_r      -0.0039521  0.9960557  0.0187880 -0.210  0.83339    
## sexe_r     -0.0438849  0.9570641  0.4606422 -0.095  0.92410    
## age_d       0.0360206  1.0366772  0.0203668  1.769  0.07696 .  
## sexe_d      0.2786636  1.3213627  0.5181402  0.538  0.59070    
## rejet_aigu  1.0124644  2.7523756  0.4804379  2.107  0.03508 *  
## TIF        -0.0002268  0.9997732  0.0005753 -0.394  0.69345    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     3.3575     0.2978   1.70305    6.6193
## cyp3A5D       0.2916     3.4294   0.09792    0.8683
## age_r         0.9961     1.0040   0.96004    1.0334
## sexe_r        0.9571     1.0449   0.38801    2.3607
## age_d         1.0367     0.9646   0.99611    1.0789
## sexe_d        1.3214     0.7568   0.47861    3.6481
## rejet_aigu    2.7524     0.3633   1.07339    7.0576
## TIF           0.9998     1.0002   0.99865    1.0009
## 
## Concordance= 0.758  (se = 0.054 )
## Likelihood ratio test= 24.6  on 8 df,   p=0.002
## Wald test            = 21.09  on 8 df,   p=0.007
## Score (logrank) test = 25.84  on 8 df,   p=0.001
ggforest(fit_original, data = original)

# synthetique
fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF , data = survae_augmented)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = survae_augmented)
## 
##   n= 1012, number of events= 53 
## 
##                  coef  exp(coef)   se(coef)      z Pr(>|z|)    
## haplotype   7.989e-01  2.223e+00  2.264e-01  3.528 0.000419 ***
## cyp3A5D    -1.329e+00  2.647e-01  4.163e-01 -3.192 0.001411 ** 
## age_r      -1.650e-02  9.836e-01  1.250e-02 -1.320 0.186694    
## sexe_r     -3.120e-01  7.319e-01  2.897e-01 -1.077 0.281371    
## age_d       2.036e-02  1.021e+00  1.472e-02  1.383 0.166705    
## sexe_d     -4.312e-01  6.497e-01  3.228e-01 -1.336 0.181657    
## rejet_aigu  3.180e-01  1.374e+00  2.911e-01  1.092 0.274689    
## TIF        -4.687e-05  1.000e+00  7.057e-04 -0.066 0.947046    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     2.2231     0.4498    1.4263    3.4650
## cyp3A5D       0.2647     3.7779    0.1170    0.5986
## age_r         0.9836     1.0166    0.9598    1.0080
## sexe_r        0.7319     1.3662    0.4149    1.2914
## age_d         1.0206     0.9798    0.9915    1.0505
## sexe_d        0.6497     1.5391    0.3451    1.2233
## rejet_aigu    1.3744     0.7276    0.7768    2.4316
## TIF           1.0000     1.0000    0.9986    1.0013
## 
## Concordance= 0.717  (se = 0.038 )
## Likelihood ratio test= 34.77  on 8 df,   p=3e-05
## Wald test            = 40.5  on 8 df,   p=3e-06
## Score (logrank) test = 46.37  on 8 df,   p=2e-07
ggforest(fit_synthetique)

BootstepAIC synhtetic augmented survae

boot.stepAIC(fit_synthetique, survae_augmented, B = 100, k=log(nrow(survae_augmented)))
## 
## Summary of Bootstrapping the 'stepAIC()' procedure for
## 
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = survae_augmented)
## 
## Bootstrap samples: 100 
## Direction: backward 
## Penalty: 6.92 * df
## 
## Covariates selected
##            (%)
## haplotype   80
## cyp3A5D     65
## age_d       20
## rejet_aigu  11
## age_r       10
## sexe_d      10
## sexe_r       8
## Null         1
## TIF          1
## 
## Coefficients Sign
##            + (%) - (%)
## age_d        100     0
## haplotype    100     0
## rejet_aigu   100     0
## TIF          100     0
## age_r          0   100
## cyp3A5D        0   100
## sexe_d         0   100
## sexe_r         0   100
## 
## Stat Significance
##            (%)
## age_d      100
## age_r      100
## cyp3A5D    100
## haplotype  100
## rejet_aigu 100
## sexe_d     100
## sexe_r     100
## TIF        100
## 
## 
## The stepAIC() for the original data-set gave
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D, 
##     data = survae_augmented)
## 
##              coef exp(coef) se(coef)      z        p
## haplotype  0.7674    2.1542   0.2187  3.510 0.000449
## cyp3A5D   -1.6254    0.1968   0.3899 -4.169 3.07e-05
## 
## Likelihood ratio test=25.91  on 2 df, p=2.37e-06
## n= 1012, number of events= 53 
## 
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Surv(delai_event, event) ~ haplotype + cyp3A5D + age_r + sexe_r + 
##     age_d + sexe_d + rejet_aigu + TIF
## 
## Final Model:
## Surv(delai_event, event) ~ haplotype + cyp3A5D
## 
## 
##           Step Df    Deviance Resid. Df Resid. Dev      AIC
## 1                                    45  -34.77283 609.6722
## 2        - TIF  1 0.004421406        46  -34.76841 602.7569
## 3     - sexe_r  1 1.153922732        47  -33.61449 596.9912
## 4 - rejet_aigu  1 1.362272126        48  -32.25221 591.4338
## 5      - age_r  1 1.652731892        49  -30.59948 586.1668
## 6      - age_d  1 1.819471847        50  -28.78001 581.0666
## 7     - sexe_d  1 2.874402337        51  -25.90561 577.0213

Final model original

fit_original <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D, 
##     data = original1)
## 
##   n= 253, number of events= 22 
## 
##              coef exp(coef) se(coef)      z Pr(>|z|)    
## haplotype  1.1992    3.3174   0.3298  3.636 0.000277 ***
## cyp3A5D   -0.8160    0.4422   0.5266 -1.550 0.121251    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## haplotype    3.3174     0.3014    1.7380     6.332
## cyp3A5D      0.4422     2.2614    0.1575     1.241
## 
## Concordance= 0.711  (se = 0.051 )
## Likelihood ratio test= 16.1  on 2 df,   p=3e-04
## Wald test            = 15.71  on 2 df,   p=4e-04
## Score (logrank) test = 17.43  on 2 df,   p=2e-04

Final model synthetic

fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D  , data = survae_augmented)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D, 
##     data = survae_augmented)
## 
##   n= 1012, number of events= 53 
## 
##              coef exp(coef) se(coef)      z Pr(>|z|)    
## haplotype  0.7674    2.1542   0.2187  3.510 0.000449 ***
## cyp3A5D   -1.6254    0.1968   0.3899 -4.169 3.07e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## haplotype    2.1542     0.4642   1.40333    3.3069
## cyp3A5D      0.1968     5.0807   0.09166    0.4227
## 
## Concordance= 0.666  (se = 0.034 )
## Likelihood ratio test= 25.91  on 2 df,   p=2e-06
## Wald test            = 31.51  on 2 df,   p=1e-07
## Score (logrank) test = 37.41  on 2 df,   p=8e-09

Bootstrap of the coefficient for haplotype

Allow to define the variability range of HR for a given dataset (intra dataset variability)

# Define the Cox model
cox_model <- function(data, indices) {
  d <- data[indices,] # allows bootstrapping to sample the data
  fit <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D , data=d)
  return(fit$coefficients)
}
# Set the seed for reproducibility
set.seed(12)

# Bootstrap the Cox model
boot_results <- boot(data=survae_augmented, statistic=cox_model, R=100)

# Calculate summary statistics

# Convert bootstrap results to a data frame for ggplot2
boot_hrs <- exp(boot_results$t) # Convert log(HR) to HR
hr_data_haplo <- data.frame(HR=boot_hrs[,1])
hr_data_cyp3A5D <- data.frame(HR=boot_hrs[,2])

# Calculate summary statistics
summary_stats <- quantile(hr_data_haplo$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) %>%  
  bind_rows(quantile(hr_data_cyp3A5D$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) )
names(summary_stats) <- c("Min","2.5th", "5th", "25th", "Median", "75th", "95th","97.5th","Max")

# Create the histogram
ggplot(hr_data_haplo, aes(x=HR)) +
  geom_histogram(bins=30, fill="#007a86", color="black") +
  # # geom_vline(aes(xintercept=summary_stats["Min"]), color="red", linetype="dashed") +
  # geom_vline(aes(xintercept=summary_stats["25th"][[1]][[1]]), color="gray", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["Median"][[1]][[1]]), color="blue", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["75th"])[[1]][[1]], color="gray", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["Max"]), color="purple", linetype="dashed") +
  labs(title="Bootstrap Distribution of Hazard Ratios", x="Hazard Ratio (HR)", y="Frequency") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

# Print summary statistics
knitr::kable(summary_stats, "simple")
Min 2.5th 5th 25th Median 75th 95th 97.5th Max
1.4951627 1.7074889 1.7441759 1.9988662 2.1810754 2.5054622 3.1842282 3.3852138 3.545362
0.0884964 0.1042443 0.1096196 0.1664911 0.2070725 0.2645651 0.3945493 0.4388291 1.388868

Modele final & KM

km_original <- survfit(Surv(delai_event, event) ~ haplotype, data = original)
ggsurvplot(
  km_original,
  data = original,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)

km_synthetique <- survfit(Surv(delai_event, event) ~ haplotype, data = survae_augmented_factor)
km_synthetique_survae_augmented <- ggsurvplot(
  km_synthetique,
  data = survae_augmented_factor,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)
km_synthetique_survae_augmented

Plots original & synthetic combined

## combine data
combined_df <- rbind(original %>% mutate(group = "original"), survae_augmented_factor %>% mutate(group = "synthetic")) %>% mutate(combined_haplotype = str_c(haplotype,"_", group ))

## fit the model
km_combined <- survfit(Surv(delai_event, event) ~ combined_haplotype, data = combined_df)

# plot
ggsurvplot(fit = km_combined, 
           data = combined_df,
           
  size = 1,                 # change line size
  conf.int = FALSE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.35, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)

Graphical exploration of distribution

library(GGally)

pm_survae_augmented <- combined_df %>% select(haplotype:delai_event, group) %>% ggpairs(
  ggplot2::aes(colour = group,alpha = 0.5),
  upper = list(continuous = wrap("cor", size = 1.5)),
  lower=list(combo=wrap("facethist", binwidth=0.5))) + 
  theme(strip.text.x = element_text(size = 5),
           strip.text.y = element_text(size = 5),axis.text = element_text(size = 5))
pm_survae_augmented

# ggsave("comparaison_distribution_survae_augmented.pdf")

Evaluation of variability interdataset augmented tvae

# Définir le répertoire où se trouvent les fichiers
repertoire <- "~/Documents/avatar/tvae_ctgan_variability/Gen_data_synth_for_bootstrap/Generate_graft_loss3_multi/Graft_loss_survae_large"

# Lire tous les fichiers CSV dans le répertoire
liste_donnees <- list.files(repertoire, pattern = "*.dat", full.names = TRUE) %>%
  map(read_csv)
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Appliquer le modèle de Cox à chaque jeu de données
resultats <- map(liste_donnees, ~ coxph(Surv(delai_event, event) ~ haplotype +  cyp3A5D, data = .x))

# Extraire les HR et les quantiles pour chaque variable
quantiles <- c(0, 5, 25, 50, 75, 95, 100)

hr_haplotype <- map(resultats, ~ tidy(.x, exponentiate = TRUE)) %>%
  map_dfr(~ .x %>% filter(term == "haplotype") %>% select(estimate)) %>%
  reframe(across(estimate, ~ quantile(., probs = quantiles/100))) %>% 
  mutate(quantiles = c(0, 5, 25, 50, 75, 95, 100), name = "haplotype")


hr_cyp3A5D <- map(resultats, ~ tidy(.x, exponentiate = TRUE)) %>%
  map_dfr(~ .x %>% filter(term == "cyp3A5D") %>% select(estimate)) %>%
  reframe(across(estimate, ~ quantile(., probs = quantiles/100))) %>% 
  mutate(quantiles = c(0, 5, 25, 50, 75, 95, 100), name = "cyp3A5D")


# Afficher et combiner les résultats
hr_results_augmented_tvae <-bind_rows(hr_haplotype, hr_cyp3A5D)
# Print summary statistics
knitr::kable(hr_results_augmented_tvae, "simple")
estimate quantiles name
7.257720e-01 0 haplotype
9.756248e-01 5 haplotype
1.520401e+00 25 haplotype
1.953639e+00 50 haplotype
2.428046e+00 75 haplotype
3.550429e+00 95 haplotype
5.606424e+00 100 haplotype
1.317255e-01 0 cyp3A5D
2.197592e-01 5 cyp3A5D
3.530481e-01 25 cyp3A5D
6.635241e-01 50 cyp3A5D
1.103966e+00 75 cyp3A5D
2.304465e+07 95 cyp3A5D
2.929345e+07 100 cyp3A5D

Survival Tabular Generative Adversial Network (CTGAN)

These data have been generated by Clement Benoist using the Synthcity python libnrary from the Van der Schaar lab

Non augmented data

load the data

survctgan <- read_csv("sfpt24_survctgan_data_v240111.dat") %>% 
   select(haplotype:delai_event) 
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
survctgan_factor <- survctgan %>% 
  mutate(haplotype = as.factor(haplotype),
         cyp3A5D = as.factor(cyp3A5D),
         sexe_r = as.factor(sexe_r),
         sexe_d = as.factor(sexe_d),
         # CYP3A4_1B = as.factor(CYP3A4_1B),
         # MDR1_C1236T = as.factor(MDR1_C1236T),
         # MDR1_G2677T = as.factor(MDR1_G2677T),
         # MDR1_C3435T = as.factor(MDR1_C3435T),
         rejet_aigu = as.factor(rejet_aigu))

Comparison of the datasets

Summary of the 2 datasets

summary(original)
##  haplotype   cyp3A5D       age_r       sexe_r      age_d       sexe_d 
##  autre: 97   Es : 42   Min.   :19.00   F: 97   Min.   :12.00   F: 79  
##  het  :123   NEs:211   1st Qu.:44.00   M:156   1st Qu.:25.00   M:174  
##  hom  : 33             Median :55.00           Median :40.00          
##                        Mean   :53.84           Mean   :38.49          
##                        3rd Qu.:64.00           3rd Qu.:49.00          
##                        Max.   :78.00           Max.   :73.00          
##  rejet_aigu      TIF           event          delai_event    
##  0:172      Min.   : 303   Min.   :0.00000   Min.   : 0.680  
##  1: 81      1st Qu.: 975   1st Qu.:0.00000   1st Qu.: 2.920  
##             Median :1153   Median :0.00000   Median : 5.340  
##             Mean   :1199   Mean   :0.08696   Mean   : 6.044  
##             3rd Qu.:1368   3rd Qu.:0.00000   3rd Qu.: 8.700  
##             Max.   :2580   Max.   :1.00000   Max.   :15.830
summary(survctgan)
##    haplotype        cyp3A5D          age_r           sexe_r     
##  Min.   :1.000   Min.   :1.000   Min.   :36.00   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:54.00   1st Qu.:1.000  
##  Median :2.000   Median :2.000   Median :68.00   Median :2.000  
##  Mean   :1.953   Mean   :1.913   Mean   :64.31   Mean   :1.672  
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:75.00   3rd Qu.:2.000  
##  Max.   :3.000   Max.   :2.000   Max.   :78.00   Max.   :2.000  
##      age_d           sexe_d        rejet_aigu         TIF      
##  Min.   :19.00   Min.   :1.000   Min.   :1.000   Min.   : 597  
##  1st Qu.:28.00   1st Qu.:2.000   1st Qu.:1.000   1st Qu.: 893  
##  Median :38.00   Median :2.000   Median :1.000   Median :1051  
##  Mean   :41.07   Mean   :1.755   Mean   :1.316   Mean   :1066  
##  3rd Qu.:54.00   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:1203  
##  Max.   :71.00   Max.   :2.000   Max.   :2.000   Max.   :2115  
##      event         delai_event     
##  Min.   :0.0000   Min.   : 0.9167  
##  1st Qu.:0.0000   1st Qu.: 3.3024  
##  Median :0.0000   Median : 5.7500  
##  Mean   :0.1067   Mean   : 5.6764  
##  3rd Qu.:0.0000   3rd Qu.: 7.5203  
##  Max.   :1.0000   Max.   :14.0767
# Combine original and synthetic data for visualization
combined_data <- rbind(
  original1 %>% mutate(DataType = 'Original'),
  survctgan_factor %>% mutate(DataType = 'Synthetic')
) %>% mutate_if(is.character, factor)
## Vector of categorical variables that need transformation
catVars <- c("haplotype", "cyp3A5D",  "sexe_r",  "sexe_d", 
"rejet_aigu", "event")
## Create a variable list.
vars <- c( "haplotype", "cyp3A5D", "age_r", "sexe_r", "age_d", "sexe_d", 
"rejet_aigu", "TIF", "event", "delai_event", "DataType")
tableOne <- CreateTableOne(vars = vars, strata = "DataType",factorVars = catVars, data = combined_data)
tableOne2<-print(tableOne, nonnormal = c( "age_r", "age_d", "TIF", "delai_event"), printToggle=F, minMax=T)
Original Synthetic p test
n 253 253
haplotype (%) 0.003
1 97 (38.3) 64 ( 25.3)
2 123 (48.6) 137 ( 54.2)
3 33 (13.0) 52 ( 20.6)
cyp3A5D = 2 (%) 211 (83.4) 231 ( 91.3) 0.011
age_r (median [range]) 55.00 [19.00, 78.00] 68.00 [36.00, 78.00] <0.001 nonnorm
sexe_r = 2 (%) 156 (61.7) 170 ( 67.2) 0.227
age_d (median [range]) 40.00 [12.00, 73.00] 38.00 [19.00, 71.00] 0.023 nonnorm
sexe_d = 2 (%) 174 (68.8) 191 ( 75.5) 0.113
rejet_aigu = 2 (%) 81 (32.0) 80 ( 31.6) 1.000
TIF (median [range]) 1153.00 [303.00, 2580.00] 1051.00 [597.00, 2115.00] <0.001 nonnorm
event = 1 (%) 22 ( 8.7) 27 ( 10.7) 0.548
delai_event (median [range]) 5.34 [0.68, 15.83] 5.75 [0.92, 14.08] 0.958 nonnorm
DataType = Synthetic (%) 0 ( 0.0) 253 (100.0) <0.001

individual data explorer

# boxplots
plot_boxplot(combined_data , by ="DataType") 

# histograms

# Function to create histogram for each continuous variable
plot_histograms <- function(data, var_name, group_var) {
  ggplot(data, aes(x = !!sym(var_name), fill = !!sym(group_var))) +
    geom_histogram(alpha = 0.5,show.legend = FALSE) +
    labs(x = var_name, y = "Count") +
    theme_minimal() +
    ggtitle(paste(var_name))
}

# Using select_if to identify continuous variables and map to apply the function
plots <- combined_data %>%
  select( -sexe_r,-sexe_d) %>% 
  select_if(is.numeric) %>%
  names() %>%
  map(~plot_histograms(combined_data, ., "DataType"))

# Optionally, print or arrange plots (e.g., using gridExtra or patchwork packages)

wrap_plots(plots)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

plot correlation

##Correlation Analysis
  cor_real <- cor(original1, use = "complete.obs")
  cor_synthetic <- cor(survctgan, use = "complete.obs")
  
# plots
ggcorrplot(cor_real, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

# plots
ggcorrplot(cor_synthetic, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

Modele de Cox

# original
fit_original <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = original1)
## 
##   n= 253, number of events= 22 
## 
##                  coef  exp(coef)   se(coef)      z Pr(>|z|)    
## haplotype   1.2112071  3.3575352  0.3463273  3.497  0.00047 ***
## cyp3A5D    -1.2323909  0.2915946  0.5567303 -2.214  0.02685 *  
## age_r      -0.0039521  0.9960557  0.0187880 -0.210  0.83339    
## sexe_r     -0.0438849  0.9570641  0.4606422 -0.095  0.92410    
## age_d       0.0360206  1.0366772  0.0203668  1.769  0.07696 .  
## sexe_d      0.2786636  1.3213627  0.5181402  0.538  0.59070    
## rejet_aigu  1.0124644  2.7523756  0.4804379  2.107  0.03508 *  
## TIF        -0.0002268  0.9997732  0.0005753 -0.394  0.69345    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     3.3575     0.2978   1.70305    6.6193
## cyp3A5D       0.2916     3.4294   0.09792    0.8683
## age_r         0.9961     1.0040   0.96004    1.0334
## sexe_r        0.9571     1.0449   0.38801    2.3607
## age_d         1.0367     0.9646   0.99611    1.0789
## sexe_d        1.3214     0.7568   0.47861    3.6481
## rejet_aigu    2.7524     0.3633   1.07339    7.0576
## TIF           0.9998     1.0002   0.99865    1.0009
## 
## Concordance= 0.758  (se = 0.054 )
## Likelihood ratio test= 24.6  on 8 df,   p=0.002
## Wald test            = 21.09  on 8 df,   p=0.007
## Score (logrank) test = 25.84  on 8 df,   p=0.001
ggforest(fit_original)

# synthetique
fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF , data = survctgan)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = survctgan)
## 
##   n= 253, number of events= 27 
## 
##                  coef  exp(coef)   se(coef)      z Pr(>|z|)    
## haplotype   1.2786307  3.5917181  0.3238779  3.948 7.88e-05 ***
## cyp3A5D    -0.7519075  0.4714664  0.7914843 -0.950 0.342114    
## age_r      -0.0452639  0.9557453  0.0201697 -2.244 0.024823 *  
## sexe_r      0.1131043  1.1197487  0.5025305  0.225 0.821925    
## age_d       0.0685335  1.0709365  0.0203790  3.363 0.000771 ***
## sexe_d      0.0766062  1.0796168  0.5159812  0.148 0.881974    
## rejet_aigu  0.1334569  1.1427720  0.4511890  0.296 0.767391    
## TIF         0.0009676  1.0009681  0.0008226  1.176 0.239506    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     3.5917     0.2784   1.90378    6.7762
## cyp3A5D       0.4715     2.1210   0.09994    2.2241
## age_r         0.9557     1.0463   0.91870    0.9943
## sexe_r        1.1197     0.8931   0.41818    2.9983
## age_d         1.0709     0.9338   1.02900    1.1146
## sexe_d        1.0796     0.9263   0.39270    2.9681
## rejet_aigu    1.1428     0.8751   0.47196    2.7670
## TIF           1.0010     0.9990   0.99936    1.0026
## 
## Concordance= 0.853  (se = 0.031 )
## Likelihood ratio test= 34.62  on 8 df,   p=3e-05
## Wald test            = 31.03  on 8 df,   p=1e-04
## Score (logrank) test = 35.96  on 8 df,   p=2e-05
ggforest(fit_synthetique)

BootstepAIC synhtetic ctGAN

boot.stepAIC(fit_synthetique, survctgan, B = 100, k=log(nrow(survctgan)))
## 
## Summary of Bootstrapping the 'stepAIC()' procedure for
## 
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = survctgan)
## 
## Bootstrap samples: 100 
## Direction: backward 
## Penalty: 5.53 * df
## 
## Covariates selected
##            (%)
## haplotype   99
## age_d       77
## age_r       54
## cyp3A5D     20
## TIF         15
## sexe_d      12
## rejet_aigu   5
## sexe_r       1
## 
## Coefficients Sign
##             + (%)  - (%)
## age_d      100.00   0.00
## haplotype  100.00   0.00
## sexe_r     100.00   0.00
## TIF         93.33   6.67
## rejet_aigu  60.00  40.00
## sexe_d      50.00  50.00
## cyp3A5D     10.00  90.00
## age_r        0.00 100.00
## 
## Stat Significance
##            (%)
## age_d      100
## age_r      100
## haplotype  100
## rejet_aigu 100
## sexe_d     100
## sexe_r     100
## TIF        100
## cyp3A5D     95
## 
## 
## The stepAIC() for the original data-set gave
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + age_r + 
##     age_d, data = survctgan)
## 
##               coef exp(coef) se(coef)      z        p
## haplotype  1.29286   3.64319  0.32963  3.922 8.78e-05
## age_r     -0.04451   0.95646  0.01767 -2.520 0.011748
## age_d      0.06816   1.07054  0.01991  3.424 0.000617
## 
## Likelihood ratio test=32.16  on 3 df, p=4.85e-07
## n= 253, number of events= 27 
## 
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Surv(delai_event, event) ~ haplotype + cyp3A5D + age_r + sexe_r + 
##     age_d + sexe_d + rejet_aigu + TIF
## 
## Final Model:
## Surv(delai_event, event) ~ haplotype + age_r + age_d
## 
## 
##           Step Df   Deviance Resid. Df Resid. Dev      AIC
## 1                                   19  -34.62360 225.9449
## 2     - sexe_d  1 0.02220719        20  -34.60139 220.4337
## 3     - sexe_r  1 0.05105628        21  -34.55033 214.9514
## 4 - rejet_aigu  1 0.07853818        22  -34.47179 209.4965
## 5    - cyp3A5D  1 0.72079825        23  -33.75100 204.6839
## 6        - TIF  1 1.59409108        24  -32.15691 200.7446

Final model original

fit_original <- coxph(Surv(delai_event, event) ~ haplotype +    rejet_aigu , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + rejet_aigu, 
##     data = original1)
## 
##   n= 253, number of events= 22 
## 
##              coef exp(coef) se(coef)     z Pr(>|z|)    
## haplotype  1.1681    3.2160   0.3261 3.582 0.000341 ***
## rejet_aigu 0.9238    2.5188   0.4661 1.982 0.047482 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype      3.216     0.3109     1.697     6.094
## rejet_aigu     2.519     0.3970     1.010     6.280
## 
## Concordance= 0.732  (se = 0.05 )
## Likelihood ratio test= 18.24  on 2 df,   p=1e-04
## Wald test            = 17.44  on 2 df,   p=2e-04
## Score (logrank) test = 19.29  on 2 df,   p=6e-05

Final model synthetic

fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype + age_r+ age_d , data = survctgan)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + age_r + 
##     age_d, data = survctgan)
## 
##   n= 253, number of events= 27 
## 
##               coef exp(coef) se(coef)      z Pr(>|z|)    
## haplotype  1.29286   3.64319  0.32963  3.922 8.78e-05 ***
## age_r     -0.04451   0.95646  0.01767 -2.520 0.011748 *  
## age_d      0.06816   1.07054  0.01991  3.424 0.000617 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## haplotype    3.6432     0.2745    1.9094    6.9512
## age_r        0.9565     1.0455    0.9239    0.9902
## age_d        1.0705     0.9341    1.0296    1.1131
## 
## Concordance= 0.823  (se = 0.041 )
## Likelihood ratio test= 32.16  on 3 df,   p=5e-07
## Wald test            = 29.2  on 3 df,   p=2e-06
## Score (logrank) test = 33.33  on 3 df,   p=3e-07

Bootstrap of the coefficient for haplotype

Allow to define the variability range of HR for a given dataset (intra dataset variability)

# Define the Cox model
cox_model <- function(data, indices) {
  d <- data[indices,] # allows bootstrapping to sample the data
  fit <- coxph(Surv(delai_event, event) ~ haplotype + age_r+ age_d  , data=d)
  return(fit$coefficients)
}
# Set the seed for reproducibility
set.seed(12)

# Bootstrap the Cox model
boot_results <- boot(data=survctgan, statistic=cox_model, R=100)


# Convert bootstrap results to a data frame for ggplot2
boot_hrs <- exp(boot_results$t) # Convert log(HR) to HR
hr_data_haplo <- data.frame(HR=boot_hrs[,1])
hr_data_age_r <- data.frame(HR=boot_hrs[,2])
hr_data_age_d <- data.frame(HR=boot_hrs[,3])

# Calculate summary statistics
summary_stats <- quantile(hr_data_haplo$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) %>%  
  bind_rows(quantile(hr_data_age_r$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) ) %>% 
    bind_rows(quantile(hr_data_age_d$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) )
names(summary_stats) <- c("Min","2.5th", "5th", "25th", "Median", "75th", "95th","97.5th","Max")



# Create the histogram
ggplot(hr_data_haplo, aes(x=HR)) +
  geom_histogram(bins=30, fill="#007a86", color="black") +
  # # geom_vline(aes(xintercept=summary_stats["Min"]), color="red", linetype="dashed") +
  # geom_vline(aes(xintercept=summary_stats["25th"][[1]][[1]]), color="gray", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["Median"][[1]][[1]]), color="blue", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["75th"])[[1]][[1]], color="gray", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["Max"]), color="purple", linetype="dashed") +
  labs(title="Bootstrap Distribution of Hazard Ratios", x="Hazard Ratio (HR)", y="Frequency") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

# Print summary statistics
knitr::kable(summary_stats, "simple")
Min 2.5th 5th 25th Median 75th 95th 97.5th Max
1.9598241 2.1502791 2.3740937 3.2491720 4.4621210 6.1153594 11.491354 14.2115888 26.5526342
0.9127347 0.9180034 0.9225346 0.9386579 0.9515207 0.9615791 0.979446 0.9811417 0.9880432
1.0163727 1.0259490 1.0306551 1.0529889 1.0655245 1.0829997 1.101860 1.1040577 1.1096139

Modele final & KM

km_original <- survfit(Surv(delai_event, event) ~ haplotype, data = original)
ggsurvplot(
  km_original,
  data = original,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)

km_synthetique <- survfit(Surv(delai_event, event) ~ haplotype, data = survctgan_factor)
km_synthetique_ctgan <- ggsurvplot(
  km_synthetique,
  data = survctgan_factor,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)
km_synthetique_ctgan

Plots original & synthetic combined

## combine data
combined_df <- rbind(original %>% mutate(group = "original"), survctgan_factor %>% mutate(group = "synthetic")) %>% mutate(combined_haplotype = str_c(haplotype,"_", group ))

## fit the model
km_combined <- survfit(Surv(delai_event, event) ~ combined_haplotype, data = combined_df)

# plot
ggsurvplot(fit = km_combined, 
           data = combined_df,
           
  size = 1,                 # change line size
  conf.int = FALSE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.35, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)

Graphical exploraiotn of distribution

library(GGally)

pm_ctgan <- combined_df %>% select(haplotype:delai_event, group) %>% ggpairs(
  ggplot2::aes(colour = group,alpha = 0.5),
  upper = list(continuous = wrap("cor", size = 1.5)),
  lower=list(combo=wrap("facethist", binwidth=0.5))) + 
  theme(strip.text.x = element_text(size = 5),
           strip.text.y = element_text(size = 5),axis.text = element_text(size = 5))
pm_ctgan

# ggsave("comparaison_distribution_survctgan.pdf")

Evaluation of variability interdataset ctgan

# Définir le répertoire où se trouvent les fichiers
repertoire <- "~/Documents/avatar/tvae_ctgan_variability/Gen_data_synth_for_bootstrap/Generate_graft_loss3_multi/Graft_loss_surv_ctgan"

# Lire tous les fichiers CSV dans le répertoire
liste_donnees <- list.files(repertoire, pattern = "*.dat", full.names = TRUE) %>%
  map(read_csv)
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 253 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Appliquer le modèle de Cox à chaque jeu de données
resultats <- map(liste_donnees, ~ coxph(Surv(delai_event, event) ~ haplotype + age_r+ age_d , data = .x))

# Extraire les HR et les quantiles pour chaque variable
quantiles <- c(0, 5, 25, 50, 75, 95, 100)

hr_haplotype <- map(resultats, ~ tidy(.x, exponentiate = TRUE)) %>%
  map_dfr(~ .x %>% filter(term == "haplotype") %>% select(estimate)) %>%
  reframe(across(estimate, ~ quantile(., probs = quantiles/100))) %>% 
  mutate(quantiles = c(0, 5, 25, 50, 75, 95, 100), name = "haplotype")


hr_age_r <- map(resultats, ~ tidy(.x, exponentiate = TRUE)) %>%
  map_dfr(~ .x %>% filter(term == "age_r") %>% select(estimate)) %>%
  reframe(across(estimate, ~ quantile(., probs = quantiles/100))) %>% 
  mutate(quantiles = c(0, 5, 25, 50, 75, 95, 100), name = "age_r")

hr_age_d <- map(resultats, ~ tidy(.x, exponentiate = TRUE)) %>%
  map_dfr(~ .x %>% filter(term == "age_d") %>% select(estimate)) %>%
  reframe(across(estimate, ~ quantile(., probs = quantiles/100))) %>% 
  mutate(quantiles = c(0, 5, 25, 50, 75, 95, 100), name = "age_d")

# Afficher et combiner les résultats
hr_results_ctgan <- bind_rows(hr_haplotype, hr_age_r,hr_age_d)
# Print summary statistics
knitr::kable(hr_results_ctgan, "simple")
estimate quantiles name
0.8235224 0 haplotype
1.2077354 5 haplotype
1.9582998 25 haplotype
2.5582018 50 haplotype
3.6106090 75 haplotype
6.6765699 95 haplotype
27.5221584 100 haplotype
0.8925233 0 age_r
0.9286848 5 age_r
0.9597140 25 age_r
0.9810758 50 age_r
0.9977011 75 age_r
1.0243994 95 age_r
1.0620212 100 age_r
0.9093644 0 age_d
0.9707534 5 age_d
1.0114126 25 age_d
1.0378943 50 age_d
1.0654632 75 age_d
1.1153437 95 age_d
1.1613914 100 age_d

Augmented data

load the data

survctgan_augmented <- read_csv("sfpt24_survctgan_data_large_v240111.dat") %>% 
   select(haplotype:delai_event)
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
survctgan_augmented_factor <- survctgan_augmented %>% 
  mutate(haplotype = as.factor(haplotype),
         cyp3A5D = as.factor(cyp3A5D),
         sexe_r = as.factor(sexe_r),
         sexe_d = as.factor(sexe_d),
         # CYP3A4_1B = as.factor(CYP3A4_1B),
         # MDR1_C1236T = as.factor(MDR1_C1236T),
         # MDR1_G2677T = as.factor(MDR1_G2677T),
         # MDR1_C3435T = as.factor(MDR1_C3435T),
         rejet_aigu = as.factor(rejet_aigu))

Comparison of the datasets

Summary of the 2 datasets

summary(original)
##  haplotype   cyp3A5D       age_r       sexe_r      age_d       sexe_d 
##  autre: 97   Es : 42   Min.   :19.00   F: 97   Min.   :12.00   F: 79  
##  het  :123   NEs:211   1st Qu.:44.00   M:156   1st Qu.:25.00   M:174  
##  hom  : 33             Median :55.00           Median :40.00          
##                        Mean   :53.84           Mean   :38.49          
##                        3rd Qu.:64.00           3rd Qu.:49.00          
##                        Max.   :78.00           Max.   :73.00          
##  rejet_aigu      TIF           event          delai_event    
##  0:172      Min.   : 303   Min.   :0.00000   Min.   : 0.680  
##  1: 81      1st Qu.: 975   1st Qu.:0.00000   1st Qu.: 2.920  
##             Median :1153   Median :0.00000   Median : 5.340  
##             Mean   :1199   Mean   :0.08696   Mean   : 6.044  
##             3rd Qu.:1368   3rd Qu.:0.00000   3rd Qu.: 8.700  
##             Max.   :2580   Max.   :1.00000   Max.   :15.830
summary(survctgan_augmented)
##    haplotype        cyp3A5D         age_r           sexe_r          age_d      
##  Min.   :1.000   Min.   :1.00   Min.   :19.00   Min.   :1.000   Min.   :12.00  
##  1st Qu.:1.000   1st Qu.:2.00   1st Qu.:40.00   1st Qu.:1.000   1st Qu.:21.00  
##  Median :2.000   Median :2.00   Median :55.00   Median :2.000   Median :30.00  
##  Mean   :1.845   Mean   :1.83   Mean   :51.89   Mean   :1.642   Mean   :31.99  
##  3rd Qu.:2.000   3rd Qu.:2.00   3rd Qu.:63.00   3rd Qu.:2.000   3rd Qu.:42.00  
##  Max.   :3.000   Max.   :2.00   Max.   :78.00   Max.   :2.000   Max.   :58.00  
##      sexe_d        rejet_aigu         TIF             event       
##  Min.   :1.000   Min.   :1.000   Min.   : 303.0   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.: 826.8   1st Qu.:0.0000  
##  Median :2.000   Median :1.000   Median : 993.0   Median :0.0000  
##  Mean   :1.713   Mean   :1.327   Mean   :1020.8   Mean   :0.1354  
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:1174.8   3rd Qu.:0.0000  
##  Max.   :2.000   Max.   :2.000   Max.   :2580.0   Max.   :1.0000  
##   delai_event    
##  Min.   : 1.022  
##  1st Qu.: 4.044  
##  Median : 5.876  
##  Mean   : 6.026  
##  3rd Qu.: 7.743  
##  Max.   :15.119
# Combine original and synthetic data for visualization
combined_data <- rbind(
  original1 %>% mutate(DataType = 'Original'),
  survctgan_augmented_factor %>% mutate(DataType = 'Synthetic')
) %>% mutate_if(is.character, factor)
## Vector of categorical variables that need transformation
catVars <- c("haplotype", "cyp3A5D",  "sexe_r",  "sexe_d", 
"rejet_aigu", "event")
## Create a variable list.
vars <- c( "haplotype", "cyp3A5D", "age_r", "sexe_r", "age_d", "sexe_d", 
"rejet_aigu", "TIF", "event", "delai_event", "DataType")
tableOne <- CreateTableOne(vars = vars, strata = "DataType",factorVars = catVars, data = combined_data)
tableOne2<-print(tableOne, nonnormal = c( "age_r", "age_d", "TIF", "delai_event"), printToggle=F, minMax=T)
Original Synthetic p test
n 253 1012
haplotype (%) 0.108
1 97 (38.3) 342 ( 33.8)
2 123 (48.6) 485 ( 47.9)
3 33 (13.0) 185 ( 18.3)
cyp3A5D = 2 (%) 211 (83.4) 840 ( 83.0) 0.955
age_r (median [range]) 55.00 [19.00, 78.00] 55.00 [19.00, 78.00] 0.119 nonnorm
sexe_r = 2 (%) 156 (61.7) 650 ( 64.2) 0.492
age_d (median [range]) 40.00 [12.00, 73.00] 30.00 [12.00, 58.00] <0.001 nonnorm
sexe_d = 2 (%) 174 (68.8) 722 ( 71.3) 0.467
rejet_aigu = 2 (%) 81 (32.0) 331 ( 32.7) 0.893
TIF (median [range]) 1153.00 [303.00, 2580.00] 993.00 [303.00, 2580.00] <0.001 nonnorm
event = 1 (%) 22 ( 8.7) 137 ( 13.5) 0.049
delai_event (median [range]) 5.34 [0.68, 15.83] 5.88 [1.02, 15.12] 0.102 nonnorm
DataType = Synthetic (%) 0 ( 0.0) 1012 (100.0) <0.001

individual data explorer

# boxplots
plot_boxplot(combined_data , by ="DataType") 

# histograms

# Function to create histogram for each continuous variable
plot_histograms <- function(data, var_name, group_var) {
  ggplot(data, aes(x = !!sym(var_name), fill = !!sym(group_var))) +
    geom_histogram(alpha = 0.5,show.legend = FALSE) +
    labs(x = var_name, y = "Count") +
    theme_minimal() +
    ggtitle(paste(var_name))
}

# Using select_if to identify continuous variables and map to apply the function
plots <- combined_data %>%
  select( -sexe_r,-sexe_d) %>% 
  select_if(is.numeric) %>%
  names() %>%
  map(~plot_histograms(combined_data, ., "DataType"))

# Optionally, print or arrange plots (e.g., using gridExtra or patchwork packages)

wrap_plots(plots)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

plot correlation

##Correlation Analysis
  cor_real <- cor(original1, use = "complete.obs")
  cor_synthetic <- cor(survctgan_augmented, use = "complete.obs")
  
# plots
ggcorrplot(cor_real, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

# plots
ggcorrplot(cor_synthetic, hc.order = TRUE, type = "lower",
           lab = TRUE,  pch.cex = 5,
  tl.cex = 6, lab_size = 2)

Modele de Cox

# original
fit_original <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = original1)
## 
##   n= 253, number of events= 22 
## 
##                  coef  exp(coef)   se(coef)      z Pr(>|z|)    
## haplotype   1.2112071  3.3575352  0.3463273  3.497  0.00047 ***
## cyp3A5D    -1.2323909  0.2915946  0.5567303 -2.214  0.02685 *  
## age_r      -0.0039521  0.9960557  0.0187880 -0.210  0.83339    
## sexe_r     -0.0438849  0.9570641  0.4606422 -0.095  0.92410    
## age_d       0.0360206  1.0366772  0.0203668  1.769  0.07696 .  
## sexe_d      0.2786636  1.3213627  0.5181402  0.538  0.59070    
## rejet_aigu  1.0124644  2.7523756  0.4804379  2.107  0.03508 *  
## TIF        -0.0002268  0.9997732  0.0005753 -0.394  0.69345    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     3.3575     0.2978   1.70305    6.6193
## cyp3A5D       0.2916     3.4294   0.09792    0.8683
## age_r         0.9961     1.0040   0.96004    1.0334
## sexe_r        0.9571     1.0449   0.38801    2.3607
## age_d         1.0367     0.9646   0.99611    1.0789
## sexe_d        1.3214     0.7568   0.47861    3.6481
## rejet_aigu    2.7524     0.3633   1.07339    7.0576
## TIF           0.9998     1.0002   0.99865    1.0009
## 
## Concordance= 0.758  (se = 0.054 )
## Likelihood ratio test= 24.6  on 8 df,   p=0.002
## Wald test            = 21.09  on 8 df,   p=0.007
## Score (logrank) test = 25.84  on 8 df,   p=0.001
ggforest(fit_original)

# synthetique
fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D +  age_r + sexe_r + age_d +  sexe_d + rejet_aigu + TIF , data = survctgan_augmented)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = survctgan_augmented)
## 
##   n= 1012, number of events= 137 
## 
##                  coef  exp(coef)   se(coef)      z Pr(>|z|)    
## haplotype   7.812e-01  2.184e+00  1.330e-01  5.876 4.21e-09 ***
## cyp3A5D    -1.336e+00  2.628e-01  1.940e-01 -6.889 5.61e-12 ***
## age_r      -2.002e-02  9.802e-01  7.835e-03 -2.555  0.01063 *  
## sexe_r     -3.167e-01  7.285e-01  1.927e-01 -1.644  0.10020    
## age_d       6.554e-02  1.068e+00  9.929e-03  6.601 4.07e-11 ***
## sexe_d      2.623e-01  1.300e+00  1.921e-01  1.366  0.17205    
## rejet_aigu  5.571e-01  1.746e+00  1.902e-01  2.928  0.00341 ** 
## TIF         6.576e-05  1.000e+00  2.986e-04  0.220  0.82568    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype     2.1841     0.4579    1.6831    2.8343
## cyp3A5D       0.2628     3.8055    0.1797    0.3843
## age_r         0.9802     1.0202    0.9652    0.9954
## sexe_r        0.7285     1.3726    0.4994    1.0628
## age_d         1.0677     0.9366    1.0472    1.0887
## sexe_d        1.2999     0.7693    0.8921    1.8942
## rejet_aigu    1.7456     0.5729    1.2023    2.5344
## TIF           1.0001     0.9999    0.9995    1.0007
## 
## Concordance= 0.8  (se = 0.022 )
## Likelihood ratio test= 198.1  on 8 df,   p=<2e-16
## Wald test            = 187.2  on 8 df,   p=<2e-16
## Score (logrank) test = 240.2  on 8 df,   p=<2e-16
ggforest(fit_synthetique)

BootstepAIC synhtetic augmented ctGAN

boot.stepAIC(fit_synthetique, survctgan_augmented, B = 100, k=log(nrow(survctgan_augmented)))
## 
## Summary of Bootstrapping the 'stepAIC()' procedure for
## 
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_r + sexe_r + age_d + sexe_d + rejet_aigu + TIF, data = survctgan_augmented)
## 
## Bootstrap samples: 100 
## Direction: backward 
## Penalty: 6.92 * df
## 
## Covariates selected
##            (%)
## age_d      100
## cyp3A5D    100
## haplotype  100
## rejet_aigu  60
## age_r       35
## sexe_d      11
## sexe_r       7
## 
## Coefficients Sign
##            + (%) - (%)
## age_d        100     0
## haplotype    100     0
## rejet_aigu   100     0
## sexe_d       100     0
## age_r          0   100
## cyp3A5D        0   100
## sexe_r         0   100
## 
## Stat Significance
##            (%)
## age_d      100
## age_r      100
## cyp3A5D    100
## haplotype  100
## rejet_aigu 100
## sexe_d     100
## sexe_r     100
## 
## 
## The stepAIC() for the original data-set gave
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_d, data = survctgan_augmented)
## 
##                coef exp(coef)  se(coef)      z        p
## haplotype  0.801802  2.229554  0.123137  6.511 7.44e-11
## cyp3A5D   -1.437209  0.237590  0.182105 -7.892 2.97e-15
## age_d      0.060010  1.061847  0.009173  6.542 6.08e-11
## 
## Likelihood ratio test=181.3  on 3 df, p=< 2.2e-16
## n= 1012, number of events= 137 
## 
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Surv(delai_event, event) ~ haplotype + cyp3A5D + age_r + sexe_r + 
##     age_d + sexe_d + rejet_aigu + TIF
## 
## Final Model:
## Surv(delai_event, event) ~ haplotype + cyp3A5D + age_d
## 
## 
##           Step Df   Deviance Resid. Df Resid. Dev      AIC
## 1                                  129  -198.0495 1466.889
## 2        - TIF  1 0.04805559       130  -198.0015 1460.017
## 3     - sexe_d  1 1.88958696       131  -196.1119 1454.987
## 4     - sexe_r  1 2.13631707       132  -193.9756 1450.204
## 5      - age_r  1 5.78800702       133  -188.1876 1449.072
## 6 - rejet_aigu  1 6.83421403       134  -181.3533 1448.986

Final model original

fit_original <- coxph(Surv(delai_event, event) ~ haplotype +    rejet_aigu , data = original1)
summary(fit_original)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + rejet_aigu, 
##     data = original1)
## 
##   n= 253, number of events= 22 
## 
##              coef exp(coef) se(coef)     z Pr(>|z|)    
## haplotype  1.1681    3.2160   0.3261 3.582 0.000341 ***
## rejet_aigu 0.9238    2.5188   0.4661 1.982 0.047482 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## haplotype      3.216     0.3109     1.697     6.094
## rejet_aigu     2.519     0.3970     1.010     6.280
## 
## Concordance= 0.732  (se = 0.05 )
## Likelihood ratio test= 18.24  on 2 df,   p=1e-04
## Wald test            = 17.44  on 2 df,   p=2e-04
## Score (logrank) test = 19.29  on 2 df,   p=6e-05

Final model synthetic

fit_synthetique <- coxph(Surv(delai_event, event) ~ haplotype +  cyp3A5D+ age_d  , data = survctgan_augmented)
summary(fit_synthetique)
## Call:
## coxph(formula = Surv(delai_event, event) ~ haplotype + cyp3A5D + 
##     age_d, data = survctgan_augmented)
## 
##   n= 1012, number of events= 137 
## 
##                coef exp(coef)  se(coef)      z Pr(>|z|)    
## haplotype  0.801802  2.229554  0.123137  6.511 7.44e-11 ***
## cyp3A5D   -1.437209  0.237590  0.182105 -7.892 2.97e-15 ***
## age_d      0.060010  1.061847  0.009173  6.542 6.08e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## haplotype    2.2296     0.4485    1.7515    2.8381
## cyp3A5D      0.2376     4.2089    0.1663    0.3395
## age_d        1.0618     0.9418    1.0429    1.0811
## 
## Concordance= 0.792  (se = 0.022 )
## Likelihood ratio test= 181.3  on 3 df,   p=<2e-16
## Wald test            = 174.4  on 3 df,   p=<2e-16
## Score (logrank) test = 219  on 3 df,   p=<2e-16

Bootstrap of the coefficient for haplotype

Allow to define the variability range of HR for a given dataset (intra dataset variability)

# Define the Cox model
cox_model <- function(data, indices) {
  d <- data[indices,] # allows bootstrapping to sample the data
  fit <- coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D+ age_d  , data=d)
  return(fit$coefficients)
}

# Set the seed for reproducibility
set.seed(12)

# Bootstrap the Cox model
boot_results <- boot(data=survctgan_augmented, statistic=cox_model, R=100)

# Convert bootstrap results to a data frame for ggplot2
boot_hrs <- exp(boot_results$t) # Convert log(HR) to HR
hr_data_haplo <- data.frame(HR=boot_hrs[,1])
hr_data_cyp3A5D <- data.frame(HR=boot_hrs[,2])
hr_data_age_d <- data.frame(HR=boot_hrs[,3])

# Calculate summary statistics
summary_stats <- quantile(hr_data_haplo$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) %>%  
  bind_rows(quantile(hr_data_cyp3A5D$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) ) %>% 
    bind_rows(quantile(hr_data_age_d$HR, probs = c(0, 0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975, 1)) )
names(summary_stats) <- c("Min","2.5th", "5th", "25th", "Median", "75th", "95th","97.5th","Max")


# Create the histogram
ggplot(hr_data_haplo, aes(x=HR)) +
  geom_histogram(bins=30, fill="#007a86", color="black") +
  # # geom_vline(aes(xintercept=summary_stats["Min"]), color="red", linetype="dashed") +
  # geom_vline(aes(xintercept=summary_stats["25th"][[1]][[1]]), color="gray", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["Median"][[1]][[1]]), color="blue", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["75th"])[[1]][[1]], color="gray", linetype="dashed", linewidth=2) +
  # geom_vline(aes(xintercept=summary_stats["Max"]), color="purple", linetype="dashed") +
  labs(title="Bootstrap Distribution of Hazard Ratios", x="Hazard Ratio (HR)", y="Frequency") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

# Print summary statistics
knitr::kable(summary_stats, "simple")
Min 2.5th 5th 25th Median 75th 95th 97.5th Max
1.6033717 1.7518195 1.8077805 2.1068108 2.2414160 2.4216252 2.7144741 2.8133390 3.0197868
0.1489253 0.1669671 0.1798056 0.2064424 0.2305468 0.2583895 0.3102633 0.3266667 0.4055641
1.0333989 1.0347498 1.0369765 1.0538198 1.0626267 1.0676404 1.0801418 1.0862602 1.0930407

Modele final & KM

km_original <- survfit(Surv(delai_event, event) ~ haplotype, data = original)
ggsurvplot(
  km_original,
  data = original,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)

km_synthetique <- survfit(Surv(delai_event, event) ~ haplotype, data = survctgan_augmented_factor)
km_synthetique_ctgan_augmented <- ggsurvplot(
  km_synthetique,
  data = survctgan_augmented_factor,
  size = 1,                 # change line size
  conf.int = TRUE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)
km_synthetique_ctgan_augmented

Plots original & synthetic combined

## combine data
combined_df <- rbind(original %>% mutate(group = "original"), survctgan_augmented_factor %>% mutate(group = "synthetic")) %>% mutate(combined_haplotype = str_c(haplotype,"_", group ))

## fit the model
km_combined <- survfit(Surv(delai_event, event) ~ combined_haplotype, data = combined_df)

# plot
ggsurvplot(fit = km_combined, 
           data = combined_df,
           
  size = 1,                 # change line size
  conf.int = FALSE,          # Add confidence interval
  pval = TRUE,              # Add p-value
  risk.table = TRUE,        # Add risk table
  risk.table.col = "strata",# Risk table color by groups
  risk.table.height = 0.35, # Useful to change when you have multiple groups
  ggtheme = theme_bw()      # Change ggplot2 theme
)

Graphical exploraiotn of distribution

library(GGally)

pm_ctgan_augmented <- combined_df %>% select(haplotype:delai_event, group) %>% ggpairs(
  ggplot2::aes(colour = group,alpha = 0.5),
  upper = list(continuous = wrap("cor", size = 1.5)),
  lower=list(combo=wrap("facethist", binwidth=0.5))) + 
  theme(strip.text.x = element_text(size = 5),
           strip.text.y = element_text(size = 5),axis.text = element_text(size = 5))
pm_ctgan_augmented

# ggsave("comparaison_distribution_survctgan_augmented.pdf")

Evaluation of variability interdataset augmented ctgan

# Définir le répertoire où se trouvent les fichiers
repertoire <- "~/Documents/avatar/tvae_ctgan_variability/Gen_data_synth_for_bootstrap/Generate_graft_loss3_multi/Graft_loss_surv_ctgan_large"

# Lire tous les fichiers CSV dans le répertoire
liste_donnees <- list.files(repertoire, pattern = "*.dat", full.names = TRUE) %>%
  map(read_csv)
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1012 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): haplotype, cyp3A5D, age_r, sexe_r, age_d, sexe_d, rejet_aigu, TIF,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Appliquer le modèle de Cox à chaque jeu de données
resultats <- map(liste_donnees, ~ coxph(Surv(delai_event, event) ~ haplotype + cyp3A5D+ age_d , data = .x))

# Extraire les HR et les quantiles pour chaque variable
quantiles <- c(0, 5, 25, 50, 75, 95, 100)

hr_haplotype <- map(resultats, ~ tidy(.x, exponentiate = TRUE)) %>%
  map_dfr(~ .x %>% filter(term == "haplotype") %>% select(estimate)) %>%
  reframe(across(estimate, ~ quantile(., probs = quantiles/100))) %>% 
  mutate(quantiles = c(0, 5, 25, 50, 75, 95, 100), name = "haplotype")


hr_cyp3A5D <- map(resultats, ~ tidy(.x, exponentiate = TRUE)) %>%
  map_dfr(~ .x %>% filter(term == "cyp3A5D") %>% select(estimate)) %>%
  reframe(across(estimate, ~ quantile(., probs = quantiles/100))) %>% 
  mutate(quantiles = c(0, 5, 25, 50, 75, 95, 100), name = "cyp3A5D")

hr_age_d <- map(resultats, ~ tidy(.x, exponentiate = TRUE)) %>%
  map_dfr(~ .x %>% filter(term == "age_d") %>% select(estimate)) %>%
  reframe(across(estimate, ~ quantile(., probs = quantiles/100))) %>% 
  mutate(quantiles = c(0, 5, 25, 50, 75, 95, 100), name = "age_d")

# Afficher et combiner les résultats
hr_results_augmented_ctgan <- bind_rows(hr_haplotype, hr_cyp3A5D,hr_age_d)
# Print summary statistics
knitr::kable(hr_results_augmented_ctgan, "simple")
estimate quantiles name
1.328745e+00 0 haplotype
1.455721e+00 5 haplotype
2.102828e+00 25 haplotype
2.558735e+00 50 haplotype
3.258177e+00 75 haplotype
4.819013e+00 95 haplotype
5.614200e+00 100 haplotype
9.628560e-02 0 cyp3A5D
1.944780e-01 5 cyp3A5D
3.290273e-01 25 cyp3A5D
5.433777e-01 50 cyp3A5D
8.007847e-01 75 cyp3A5D
1.773390e+00 95 cyp3A5D
7.977806e+06 100 cyp3A5D
9.450976e-01 0 age_d
9.962434e-01 5 age_d
1.013176e+00 25 age_d
1.034718e+00 50 age_d
1.059836e+00 75 age_d
1.085749e+00 95 age_d
1.115233e+00 100 age_d

Tableone summary all data

combined_data <- bind_rows(original1 %>% mutate(DataType = 'Original', haplotype = as.factor(haplotype), cyp3A5D = as.factor(cyp3A5D), sexe_r = as.factor(sexe_r), sexe_d = as.factor(sexe_d),rejet_aigu = as.factor(rejet_aigu),event = as.factor(event)),
                           avatars_tibble_knn5 %>% mutate(DataType = 'knn5', haplotype = as.factor(haplotype), cyp3A5D = as.factor(cyp3A5D), sexe_r = as.factor(sexe_r), sexe_d = as.factor(sexe_d),rejet_aigu = as.factor(rejet_aigu),event = as.factor(event)),
                           avatars_tibble_knn20 %>% mutate(DataType = 'knn20', haplotype = as.factor(haplotype), cyp3A5D = as.factor(cyp3A5D), sexe_r = as.factor(sexe_r), sexe_d = as.factor(sexe_d),rejet_aigu = as.factor(rejet_aigu),event = as.factor(event)),
                           avatars_tibble_knn10 %>% mutate(DataType = 'knn10', haplotype = as.factor(haplotype), cyp3A5D = as.factor(cyp3A5D), sexe_r = as.factor(sexe_r), sexe_d = as.factor(sexe_d),rejet_aigu = as.factor(rejet_aigu),event = as.factor(event)),
                           augmented_data_5 %>% mutate(DataType = 'augmented_knn5', haplotype = as.factor(haplotype), cyp3A5D = as.factor(cyp3A5D), sexe_r = as.factor(sexe_r), sexe_d = as.factor(sexe_d),rejet_aigu = as.factor(rejet_aigu),event = as.factor(event)),
                           augmented_data_20 %>% mutate(DataType = 'augmented_knn20', haplotype = as.factor(haplotype), cyp3A5D = as.factor(cyp3A5D), sexe_r = as.factor(sexe_r), sexe_d = as.factor(sexe_d),rejet_aigu = as.factor(rejet_aigu),event = as.factor(event)),
                           augmented_data_10 %>% mutate(DataType = 'augmented_knn10', haplotype = as.factor(haplotype), cyp3A5D = as.factor(cyp3A5D), sexe_r = as.factor(sexe_r), sexe_d = as.factor(sexe_d),rejet_aigu = as.factor(rejet_aigu),event = as.factor(event)),
                           survae_factor %>% mutate(DataType = 'survae', haplotype = as.factor(haplotype), cyp3A5D = as.factor(cyp3A5D), sexe_r = as.factor(sexe_r), sexe_d = as.factor(sexe_d),rejet_aigu = as.factor(rejet_aigu),event = as.factor(event)),
                           survae_augmented_factor %>% mutate(DataType = 'augmented_survae', haplotype = as.factor(haplotype), cyp3A5D = as.factor(cyp3A5D), sexe_r = as.factor(sexe_r), sexe_d = as.factor(sexe_d),rejet_aigu = as.factor(rejet_aigu),event = as.factor(event)),
                           survctgan_factor %>% mutate(DataType = 'ctgan', haplotype = as.factor(haplotype), cyp3A5D = as.factor(cyp3A5D), sexe_r = as.factor(sexe_r), sexe_d = as.factor(sexe_d),rejet_aigu = as.factor(rejet_aigu),event = as.factor(event)),
                           survctgan_augmented_factor %>% mutate(DataType = 'augmented_ctgan', haplotype = as.factor(haplotype), cyp3A5D = as.factor(cyp3A5D), sexe_r = as.factor(sexe_r), sexe_d = as.factor(sexe_d),rejet_aigu = as.factor(rejet_aigu),event = as.factor(event))
                           )



## Vector of categorical variables that need transformation
catVars <- c("haplotype", "cyp3A5D",  "sexe_r",  "sexe_d", 
             "rejet_aigu", "event")
## Create a variable list.
vars <- c( "haplotype", "cyp3A5D", "age_r", "sexe_r", "age_d", "sexe_d", 
           "rejet_aigu", "TIF", "event", "delai_event", "DataType")
tableOne <- CreateTableOne(vars = vars, strata = "DataType",factorVars = catVars, data = combined_data)
tableOne2<-print(tableOne, nonnormal = c( "age_r", "age_d", "TIF", "delai_event"), printToggle=F, minMax=T)
kableone(tableOne2)
augmented_ctgan augmented_knn10 augmented_knn20 augmented_knn5 augmented_survae ctgan knn10 knn20 knn5 Original survae p test
n 1012 1012 1012 1012 1012 253 253 253 253 253 253
haplotype (%) <0.001
1 342 ( 33.8) 356 ( 35.2) 345 ( 34.1) 373 ( 36.9) 296 ( 29.2) 64 ( 25.3) 93 ( 36.8) 79 ( 31.2) 93 ( 36.8) 97 ( 38.3) 71 ( 28.1)
2 485 ( 47.9) 602 ( 59.5) 614 ( 60.7) 577 ( 57.0) 560 ( 55.3) 137 ( 54.2) 146 ( 57.7) 159 ( 62.8) 144 ( 56.9) 123 ( 48.6) 146 ( 57.7)
3 185 ( 18.3) 54 ( 5.3) 53 ( 5.2) 62 ( 6.1) 156 ( 15.4) 52 ( 20.6) 14 ( 5.5) 15 ( 5.9) 16 ( 6.3) 33 ( 13.0) 36 ( 14.2)
cyp3A5D = 2 (%) 840 ( 83.0) 880 ( 87.0) 909 ( 89.8) 859 ( 84.9) 969 ( 95.8) 231 ( 91.3) 219 ( 86.6) 224 ( 88.5) 217 ( 85.8) 211 ( 83.4) 238 ( 94.1) <0.001
age_r (median [range]) 55.00 [19.00, 78.00] 54.41 [23.53, 75.26] 55.60 [24.90, 75.77] 55.42 [23.08, 77.36] 56.00 [24.00, 78.00] 68.00 [36.00, 78.00] 54.21 [24.23, 73.48] 55.12 [25.46, 74.57] 55.09 [24.19, 73.74] 55.00 [19.00, 78.00] 56.00 [20.00, 78.00] <0.001 nonnorm
sexe_r = 2 (%) 650 ( 64.2) 666 ( 65.8) 700 ( 69.2) 656 ( 64.8) 647 ( 63.9) 170 ( 67.2) 165 ( 65.2) 178 ( 70.4) 163 ( 64.4) 156 ( 61.7) 163 ( 64.4) 0.225
age_d (median [range]) 30.00 [12.00, 58.00] 39.92 [15.46, 63.56] 39.73 [16.63, 67.75] 39.96 [15.04, 68.49] 34.00 [12.00, 66.00] 38.00 [19.00, 71.00] 39.60 [15.46, 62.80] 39.04 [18.38, 67.75] 39.82 [19.40, 68.49] 40.00 [12.00, 73.00] 34.00 [13.00, 60.00] <0.001 nonnorm
sexe_d = 2 (%) 722 ( 71.3) 743 ( 73.4) 743 ( 73.4) 717 ( 70.8) 790 ( 78.1) 191 ( 75.5) 188 ( 74.3) 188 ( 74.3) 185 ( 73.1) 174 ( 68.8) 197 ( 77.9) 0.011
rejet_aigu = 2 (%) 331 ( 32.7) 276 ( 27.3) 244 ( 24.1) 294 ( 29.1) 289 ( 28.6) 80 ( 31.6) 72 ( 28.5) 63 ( 24.9) 73 ( 28.9) 81 ( 32.0) 74 ( 29.2) 0.009
TIF (median [range]) 993.00 [303.00, 2580.00] 1141.45 [372.32, 2040.25] 1144.15 [576.99, 2091.07] 1158.71 [456.35, 2362.15] 1058.00 [565.00, 2191.00] 1051.00 [597.00, 2115.00] 1156.78 [570.38, 1987.46] 1135.02 [630.81, 2091.07] 1174.55 [456.35, 2362.15] 1153.00 [303.00, 2580.00] 1057.00 [588.00, 1912.00] <0.001 nonnorm
event = 1 (%) 137 ( 13.5) 80 ( 7.9) 55 ( 5.4) 86 ( 8.5) 53 ( 5.2) 27 ( 10.7) 20 ( 7.9) 13 ( 5.1) 21 ( 8.3) 22 ( 8.7) 14 ( 5.5) <0.001
delai_event (median [range]) 5.88 [1.02, 15.12] 5.31 [0.87, 15.33] 5.21 [1.02, 15.10] 5.45 [0.97, 14.94] 6.27 [0.82, 15.70] 5.75 [0.92, 14.08] 5.60 [0.96, 15.33] 5.34 [1.20, 15.10] 5.36 [0.97, 14.94] 5.34 [0.68, 15.83] 6.13 [1.07, 14.88] <0.001 nonnorm
DataType (%) <0.001
augmented_ctgan 1012 (100.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0)
augmented_knn10 0 ( 0.0) 1012 (100.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0)
augmented_knn20 0 ( 0.0) 0 ( 0.0) 1012 (100.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0)
augmented_knn5 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 1012 (100.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0)
augmented_survae 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 1012 (100.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0)
ctgan 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 253 (100.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0)
knn10 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 253 (100.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0)
knn20 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 253 (100.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0)
knn5 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 253 (100.0) 0 ( 0.0) 0 ( 0.0)
Original 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 253 (100.0) 0 ( 0.0)
survae 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 0 ( 0.0) 253 (100.0)

plot all covariation ggpair

library(patchwork)
pm_knn5

ggsave("Figure1.pdf")
## Saving 7 x 5 in image

plot all KM curve haplotype

# List of ggsurvplots
require("survminer")
splots <- list()
splots[[1]] <- km_original_plot
splots[[2]] <- km_synthetique_avatar_5 
splots[[3]] <- km_synthetique_avatar_5_augmented 
splots[[4]] <- km_synthetique_survae 
splots[[5]] <- km_synthetique_survae_augmented 
splots[[6]] <- km_synthetique_ctgan 
splots[[7]] <- km_synthetique_ctgan_augmented 

# Arrange multiple ggsurvplots and print the output
arrange_ggsurvplots(splots, print = TRUE,
  ncol = 1, nrow = 7)#, risk.table.height = 0.4)

if (FALSE) {
# Arrange and save into pdf file
res <- arrange_ggsurvplots(splots, print = FALSE)
ggsave("Figure2.pdf", res)
}

calculation metrics privacy DCR, NNDR

#knn5
metric_avatar_knn5_graft_loss <- read_csv("Metrics_avatar2/metric_avatar_knn5_graft_loss.csv") 
## Rows: 253 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): dcr, nndr
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
metric_knn5 <- metric_avatar_knn5_graft_loss %>% 
  summarise(
    across(
      everything(),
      list(
        min = ~ quantile(., probs = 0),
        p5 = ~ quantile(., probs = 0.05),
        p25 = ~ quantile(., probs = 0.25),
        p50 = ~ quantile(., probs = 0.5),
        p75 = ~ quantile(., probs = 0.75),
        p95 = ~ quantile(., probs = 0.95),
        max = ~ quantile(., probs = 1)
      )
    )
  )
knitr::kable(metric_knn5, "simple")
dcr_min dcr_p5 dcr_p25 dcr_p50 dcr_p75 dcr_p95 dcr_max nndr_min nndr_p5 nndr_p25 nndr_p50 nndr_p75 nndr_p95 nndr_max
0.159286 0.2838517 0.4241048 0.5665813 0.742086 1.008724 2.029339 0.0641988 0.1380793 0.2489758 0.3341827 0.5209876 0.7527488 0.924452
#knn5 augmented
metric_avatar_knn5_large_graft_loss <- read_csv("Metrics_avatar2/metric_avatar_knn5_large_graft_loss.csv")
## Rows: 1012 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): dcr, nndr
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
metric_knn5_augmented <- metric_avatar_knn5_large_graft_loss %>% summarise(
    across(
      everything(),
      list(
        min = ~ quantile(., probs = 0),
        p5 = ~ quantile(., probs = 0.05),
        p25 = ~ quantile(., probs = 0.25),
        p50 = ~ quantile(., probs = 0.5),
        p75 = ~ quantile(., probs = 0.75),
        p95 = ~ quantile(., probs = 0.95),
        max = ~ quantile(., probs = 1)
      )
    )
  )
knitr::kable(metric_knn5_augmented, "simple")
dcr_min dcr_p5 dcr_p25 dcr_p50 dcr_p75 dcr_p95 dcr_max nndr_min nndr_p5 nndr_p25 nndr_p50 nndr_p75 nndr_p95 nndr_max
0.0262085 0.0891968 0.199533 0.4231659 0.7846543 1.506673 2.486024 0.0191986 0.0503164 0.1179117 0.267012 0.5982074 0.9433168 0.9981841
#survae
metric_survae_graft_loss <- read_csv("Metrics_avatar2/metric_survae_graft_loss.csv") 
## Rows: 253 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): dcr, nndr
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
metric_survae <- metric_survae_graft_loss %>% 
  summarise(
    across(
      everything(),
      list(
        min = ~ quantile(., probs = 0),
        p5 = ~ quantile(., probs = 0.05),
        p25 = ~ quantile(., probs = 0.25),
        p50 = ~ quantile(., probs = 0.5),
        p75 = ~ quantile(., probs = 0.75),
        p95 = ~ quantile(., probs = 0.95),
        max = ~ quantile(., probs = 1)
      )
    )
  )
knitr::kable(metric_survae, "simple")
dcr_min dcr_p5 dcr_p25 dcr_p50 dcr_p75 dcr_p95 dcr_max nndr_min nndr_p5 nndr_p25 nndr_p50 nndr_p75 nndr_p95 nndr_max
0.4054311 0.8699173 1.551997 1.906325 2.389682 2.995105 3.468345 0.2303528 0.5463846 0.7706567 0.8789318 0.9440272 0.9877799 0.9990694
#survae augmented
metric_survae_large_graft_loss <- read_csv("Metrics_avatar2/metric_survae_large_graft_loss.csv") 
## Rows: 1012 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): dcr, nndr
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
metric_survae_augmented <- metric_survae_large_graft_loss %>% 
   summarise(
    across(
      everything(),
      list(
        min = ~ quantile(., probs = 0),
        p5 = ~ quantile(., probs = 0.05),
        p25 = ~ quantile(., probs = 0.25),
        p50 = ~ quantile(., probs = 0.5),
        p75 = ~ quantile(., probs = 0.75),
        p95 = ~ quantile(., probs = 0.95),
        max = ~ quantile(., probs = 1)
      )
    )
  )
knitr::kable(metric_survae_augmented, "simple")
dcr_min dcr_p5 dcr_p25 dcr_p50 dcr_p75 dcr_p95 dcr_max nndr_min nndr_p5 nndr_p25 nndr_p50 nndr_p75 nndr_p95 nndr_max
0.3015096 0.8650204 1.537663 1.938557 2.418375 3.083595 4.294902 0.1337203 0.544702 0.7858148 0.8997433 0.9570271 0.9939075 0.9999053
#ctgan
metric_survctgan_graft_loss <- read_csv("Metrics_avatar2/metric_survctgan_graft_loss.csv") 
## Rows: 253 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): dcr, nndr
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
metric_ctgan <- metric_survctgan_graft_loss %>% 
     summarise(
    across(
      everything(),
      list(
        min = ~ quantile(., probs = 0),
        p5 = ~ quantile(., probs = 0.05),
        p25 = ~ quantile(., probs = 0.25),
        p50 = ~ quantile(., probs = 0.5),
        p75 = ~ quantile(., probs = 0.75),
        p95 = ~ quantile(., probs = 0.95),
        max = ~ quantile(., probs = 1)
      )
    )
  )
knitr::kable(metric_ctgan, "simple")
dcr_min dcr_p5 dcr_p25 dcr_p50 dcr_p75 dcr_p95 dcr_max nndr_min nndr_p5 nndr_p25 nndr_p50 nndr_p75 nndr_p95 nndr_max
0.511736 0.8324347 1.279238 1.781562 2.418493 3.395527 5.59313 0.2788541 0.5188259 0.7430643 0.8675757 0.9483103 0.9925245 0.999383
#ctgan augmented
metric_avatar_survctgan_large_graft_loss <- read_csv("Metrics_avatar2/metric_survctgan_large_graft_loss.csv") 
## Rows: 1012 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): dcr, nndr
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
metric_ctgan_augmented <- metric_avatar_survctgan_large_graft_loss %>% 
       summarise(
    across(
      everything(),
      list(
        min = ~ quantile(., probs = 0),
        p5 = ~ quantile(., probs = 0.05),
        p25 = ~ quantile(., probs = 0.25),
        p50 = ~ quantile(., probs = 0.5),
        p75 = ~ quantile(., probs = 0.75),
        p95 = ~ quantile(., probs = 0.95),
        max = ~ quantile(., probs = 1)
      )
    )
  )
knitr::kable(metric_ctgan_augmented, "simple")
dcr_min dcr_p5 dcr_p25 dcr_p50 dcr_p75 dcr_p95 dcr_max nndr_min nndr_p5 nndr_p25 nndr_p50 nndr_p75 nndr_p95 nndr_max
0.2531389 0.8154637 1.335732 1.868256 2.592339 3.62116 5.117563 0.2075465 0.5173092 0.753877 0.8699735 0.9498682 0.9916846 0.9998274

plot of the metrics distribution

metrics_plot <- metric_avatar_knn5_graft_loss %>% mutate(type = "knn5") %>% 
  bind_rows(metric_avatar_knn5_large_graft_loss %>% mutate(type = "knn5_augmented")) %>% 
  bind_rows(metric_survae_graft_loss %>% mutate(type = "survae")) %>% 
  bind_rows(metric_survae_large_graft_loss %>% mutate(type = "survae_augmented")) %>% 
  bind_rows(metric_survctgan_graft_loss %>% mutate(type = "survctgan")) %>% 
  bind_rows(metric_avatar_survctgan_large_graft_loss %>% mutate(type = "survctgan_augmented")) 

# dcr
ggplot(metrics_plot, aes(x = dcr, fill = type, color = type, alpha = 0.5)) +
  geom_density(adjust = 1.5) +
  scale_alpha_identity() +
  labs(title = "DCR density Distribution by Group",
       x = "DCR",
       y = "Density",
       fill = "Group",
       color = "Group") +
  theme_minimal() +
  theme(legend.position = "right")

# nndr
ggplot(metrics_plot, aes(x = nndr, fill = type, color = type, alpha = 0.5)) +
  geom_density(adjust = 1.5) +
  scale_alpha_identity() +
  labs(title = "NNDR density distribution per group",
       x = "NNDR",
       y = "Density",
       fill = "Group",
       color = "Group") +
  theme_minimal() +
  theme(legend.position = "right")